In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report

In [None]:
# Load the training set into a pandas DataFrame

train_csv_path = '/content/iith_foml_2023_train.csv'  ## Change the input file path accordingly
data = pd.read_csv(train_csv_path)

data.head()  # preview the first rows of the training frame

Unnamed: 0,Feature 1 (Discrete),Feature 2 (Discrete),Feature 3 (Discrete),Feature 4 (Discrete),Feature 5 (Discrete),Feature 6 (Discrete),Feature 7 (Discrete),Feature 8 (Discrete),Feature 9,Feature 10,...,Feature 16,Feature 17,Feature 18,Feature 19 (Discrete),Feature 20 (Discrete),Feature 21 (Discrete),Feature 22 (Discrete),Feature 23 (Discrete),Feature 24,Target Variable (Discrete)
0,1404,12,64,14,3,1,1,1,110.502,35775.2,...,,,15.04,104,12,2,32,1409,37677.1,1
1,909,0,235,32,1,1,1,1,-40.448,35779.4,...,2200.3,4900.005,12.03,20,1,0,13,909,25239.1,1
2,654,3,175,2,1,1,1,1,-27.445,35770.4,...,1973.3,10000.004,13.01,1,1,0,13,654,27683.5,1
3,1372,12,382,14,2,0,1,0,0.001,509.2,...,,,,313,12,10,54,1377,39363.2,0
4,786,3,199,2,1,0,1,0,0.001,612.1,...,,,,171,1,5,11,786,40044.4,2


In [None]:
data.shape     # (number of rows, number of columns) of the training frame

(994, 25)

In [None]:
data.info()     # Per-column non-null counts and dtypes; a count below the total number of entries means that column has missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Feature 1 (Discrete)        994 non-null    int64  
 1   Feature 2 (Discrete)        994 non-null    int64  
 2   Feature 3 (Discrete)        994 non-null    int64  
 3   Feature 4 (Discrete)        994 non-null    int64  
 4   Feature 5 (Discrete)        994 non-null    int64  
 5   Feature 6 (Discrete)        994 non-null    int64  
 6   Feature 7 (Discrete)        994 non-null    int64  
 7   Feature 8 (Discrete)        994 non-null    int64  
 8   Feature 9                   980 non-null    float64
 9   Feature 10                  993 non-null    float64
 10  Feature 11                  993 non-null    float64
 11  Feature 12                  993 non-null    float64
 12  Feature 13                  993 non-null    float64
 13  Feature 14                  993 non

In [None]:
## Filling null values in the feature columns using simple imputer with mean strategy

# Only the features go through the imputer. The target holds class ids, and
# passing it through a mean imputer would turn the labels into floats (and
# would fill any missing label with a meaningless "average class").
target_column = 'Target Variable (Discrete)'
feature_columns = data.columns.drop(target_column)

# A row without a label cannot be used for supervised training, so fail loudly
# here instead of inventing a label.
assert data[target_column].notna().all(), 'target column contains missing values'

imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(
    imputer.fit_transform(data[feature_columns]),
    columns=feature_columns,
    index=data.index,
)

# Re-attach the untouched labels so later cells can keep selecting the target
# column from data_imputed.
data_imputed[target_column] = data[target_column]

In [None]:
data_imputed.info()      ## Every column should now report the full number of non-null entries, i.e. no missing values remain

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Feature 1 (Discrete)        994 non-null    float64
 1   Feature 2 (Discrete)        994 non-null    float64
 2   Feature 3 (Discrete)        994 non-null    float64
 3   Feature 4 (Discrete)        994 non-null    float64
 4   Feature 5 (Discrete)        994 non-null    float64
 5   Feature 6 (Discrete)        994 non-null    float64
 6   Feature 7 (Discrete)        994 non-null    float64
 7   Feature 8 (Discrete)        994 non-null    float64
 8   Feature 9                   994 non-null    float64
 9   Feature 10                  994 non-null    float64
 10  Feature 11                  994 non-null    float64
 11  Feature 12                  994 non-null    float64
 12  Feature 13                  994 non-null    float64
 13  Feature 14                  994 non

In [None]:
target_column = 'Target Variable (Discrete)'

X = data_imputed.drop(columns=[target_column])   ## Input features: every column except the target
y = data_imputed[target_column]                  ## Target variable

## Hold out 20% of the rows as an evaluation set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
## Decision Tree Classifier.
## random_state is fixed (same seed as the train/test split) so that the fitted
## tree, and therefore the reported score, is the same on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)            ## Training the model with X_train and y_train

predictions = clf.predict(X_test)     ## Predicting the classes of the held-out X_test rows

In [None]:
# Calculate the Macro F1 score (unweighted mean of the per-class F1 scores).
# Some classes in y_test are never predicted, which leaves their precision
# undefined. zero_division=0 scores those classes as 0 explicitly; the default
# setting yields the same numbers but emits a warning for every call.
macro_f1 = f1_score(y_test, predictions, average='macro', zero_division=0)

# Print the results
print(f'Macro F1 Score: {macro_f1:.2f}')
print('Classification Report:\n', classification_report(y_test, predictions, zero_division=0))

Macro F1 Score: 0.44
Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.94      0.88        50
         1.0       0.94      0.97      0.95        91
         2.0       0.73      0.62      0.67        26
         3.0       0.00      0.00      0.00         1
         4.0       1.00      1.00      1.00         1
         5.0       0.33      0.38      0.35         8
         6.0       0.91      0.91      0.91        11
         7.0       0.00      0.00      0.00         2
         8.0       0.00      0.00      0.00         5
        11.0       0.00      0.00      0.00         1
        13.0       0.00      0.00      0.00         1
        14.0       1.00      1.00      1.00         1
        15.0       0.00      0.00      0.00         1

    accuracy                           0.83       199
   macro avg       0.44      0.45      0.44       199
weighted avg       0.80      0.83      0.82       199



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Test data: impute, predict and export the predictions

In [None]:
## Load the test set into a pandas DataFrame

test_csv_path = '/content/iith_foml_2023_test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,Feature 1 (Discrete),Feature 2 (Discrete),Feature 3 (Discrete),Feature 4 (Discrete),Feature 5 (Discrete),Feature 6 (Discrete),Feature 7 (Discrete),Feature 8 (Discrete),Feature 9,Feature 10,...,Feature 15,Feature 16,Feature 17,Feature 18,Feature 19 (Discrete),Feature 20 (Discrete),Feature 21 (Discrete),Feature 22 (Discrete),Feature 23 (Discrete),Feature 24
0,146,12,42,14,7,1,1,1,118.004,35693.5,...,4200.3,1900.1,,,44,12,2,42,146,37384.5
1,35,0,12,5,0,0,1,0,0.001,471.5,...,531.4,,,,17,6,8,0,35,41465.1
2,1018,8,259,2,1,1,1,1,,35774.5,...,5514.2,,,15.04,1,1,3,20,1018,37826.2
3,383,7,117,5,1,1,1,1,53.002,34094.1,...,3358.4,1100.2,14000.001,15.04,101,6,3,20,383,40277.3
4,1216,7,40,5,2,0,1,4,0.005,1471.3,...,225.1,,,3.02,276,6,7,43,1221,28419.5


In [None]:
test_data.shape    # (number of rows, number of columns) of the test frame

(426, 24)

In [None]:
test_data.info()   # Per-column non-null counts and dtypes; a count below the total number of entries means that column has missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426 entries, 0 to 425
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Feature 1 (Discrete)   426 non-null    int64  
 1   Feature 2 (Discrete)   426 non-null    int64  
 2   Feature 3 (Discrete)   426 non-null    int64  
 3   Feature 4 (Discrete)   426 non-null    int64  
 4   Feature 5 (Discrete)   426 non-null    int64  
 5   Feature 6 (Discrete)   426 non-null    int64  
 6   Feature 7 (Discrete)   426 non-null    int64  
 7   Feature 8 (Discrete)   426 non-null    int64  
 8   Feature 9              422 non-null    float64
 9   Feature 10             426 non-null    float64
 10  Feature 11             426 non-null    float64
 11  Feature 12             426 non-null    float64
 12  Feature 13             426 non-null    float64
 13  Feature 14             424 non-null    float64
 14  Feature 15             395 non-null    float64
 15  Featur

In [None]:
## Filling null values in the test features with the column means of the TRAINING data

# The imputer is fitted on the training frame and only applied to the test
# frame, so missing test values get the same fill values that were used for the
# rows the model was trained on. Fitting on the test set would fill them with
# different, test-specific means.
# Assumes every test column also exists in the training frame `data`.
test_feature_columns = test_data.columns

imputer = SimpleImputer(strategy='mean')
imputer.fit(data[test_feature_columns])

test_data_imputed = pd.DataFrame(
    imputer.transform(test_data),
    columns=test_feature_columns,
    index=test_data.index,
)

In [None]:
# Feature matrix used for the final predictions: an independent copy of the imputed test frame.
# NOTE(review): this rebinds X_test, which until now held the hold-out features
# from train_test_split. Re-running the hold-out prediction/metric cells after
# this one would pair these rows with y_test; a distinct name would avoid that.
X_test = test_data_imputed.copy()

In [None]:
test_predictions = clf.predict(X_test)    ## Predict a class for every test row with the decision tree (clf) fitted earlier

In [None]:
## Build the output table: column 1 is a 1-based row id, column 2 is the predicted class as an integer
submission_ids = range(1, len(test_predictions) + 1)
predicted_categories = test_predictions.astype(int)
output_df = pd.DataFrame({'Id': submission_ids, 'Category': predicted_categories})

In [None]:
output_df.to_csv('test_output.csv', index=False)  ## Write the predictions to test_output.csv (relative to the working directory), without the DataFrame index column

In [None]:
## Checking the data of the output file

# Read back the same relative path that to_csv wrote to. An absolute
# '/content/...' path only points at that file when the working directory
# happens to be /content.
test_data_output = pd.read_csv('test_output.csv')
test_data_output.head()

Unnamed: 0,Id,Category
0,1,6
1,2,2
2,3,1
3,4,1
4,5,1


In [None]:
test_data_output.shape        ## Row count should equal the number of test rows, with two columns (Id, Category)

(426, 2)