<a href="https://colab.research.google.com/github/KarthikAlagarsamy/AIQoD/blob/main/Karthik_AIQoD_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multi-Output Classification modelling

In [None]:
# Import necessary libraries

import pandas as pd                                                 # for data manipulation and analysis
from sklearn.impute import SimpleImputer, KNNImputer                # for handling missing values
from sklearn.preprocessing import LabelEncoder, StandardScaler      # for encoding categorical labels and standardize features
from sklearn.feature_extraction.text import TfidfVectorizer         # for converting a collection of text documents to matrix
from sklearn.ensemble import RandomForestClassifier                 # for building a random forest classifier
from sklearn.multioutput import MultiOutputClassifier               # for handling multi-output classification tasks
import scipy.sparse as sp                                           # for handling sparse matrices

In [None]:
# Read the three CSV inputs from the working directory
train_df = pd.read_csv('train.csv')
train_labels_df = pd.read_csv('trainLabels.csv')
test_df = pd.read_csv('test.csv')

# Preview the first rows of each frame, in the order they were loaded
for loaded_frame in (train_df, train_labels_df, test_df):
    print(loaded_frame.head())

   id   x1   x2                                            x3  \
0   1   NO   NO  dqOiM6yBYgnVSezBRiQXs9bvOFnRqrtIoXRIElxD7g8=   
1   2  NaN  NaN                                           NaN   
2   3   NO   NO  ib4VpsEsqJHzDiyL0dZLQ+xQzDPrkxE+9T3mx5fv2wI=   
3   4  YES   NO  BfrqME7vdLw3suQp6YAT16W2piNUmpKhMzuDrVrFQ4w=   
4   5   NO   NO  RTjsrrR8DTlJyaIP9Q3Z8s0zseqlVQTrlSe97GCWfbk=   

                                             x4        x5        x6        x7  \
0  GNjrXXA3SxbgD0dTRblAPO9jFJ7AIaZnu/f48g5XSUk=  0.576561  0.073139  0.481394   
1                                           NaN  0.000000  0.000000  0.000000   
2  X6dDAI/DZOWvu0Dg6gCgRoNr2vTUz/mc4SdHTNUPS38=  1.341803  0.051422  0.935572   
3  YGCdISifn4fLao/ASKdZFhGIq23oqzfSbUVb6px1pig=  0.653912  0.041471  0.940787   
4  3yK2OPj1uYDsoMgsxsjY1FxXkOllD8Xfh20VYGqT+nU=  1.415919  0.000000  1.000000   

         x8        x9  ... x136   x137  x138  x139 x140  x141  x142  x143  \
0  0.115697  0.472474  ...  0.0  0.810  3306 

In [None]:
# Assign column names of train_df to test_df
#
# The column names are taken from train.csv and written as a header line in
# front of a verbatim copy of test.csv; the result is then loaded as test_df.

TEST_WITH_HEADER_PATH = "test_with_header.csv"

# Read only the first row of the train CSV; just its column names are used
train_first_row = pd.read_csv("train.csv", nrows=1)

# An empty frame carrying the train columns serialises to a header-only CSV
test_with_header_df = pd.DataFrame(columns=train_first_row.columns)
test_with_header_df.to_csv(TEST_WITH_HEADER_PATH, index=False)

# Append every line of the original test CSV below that header
with open("test.csv", "r") as f, open(TEST_WITH_HEADER_PATH, "a") as f_out:
    f_out.writelines(f)

# Load the file through the same relative path it was written to. A hard-coded
# '/content/...' location only matches when the working directory is /content.
test_df = pd.read_csv(TEST_WITH_HEADER_PATH)
test_df

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,1698001,NO,NO,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,1.414798,0.000000,1.000000,0.000000,0.202060,...,0.0,1.000000,1262,892,NO,NO,NO,0,0.089686,0.193344
1,1698002,NO,NO,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,...,1.0,0.866667,4672,3311,NO,NO,NO,5,0.945032,0.471318
2,1698003,NO,NO,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.000000,1.000000,0.000000,0.703088,...,-1.0,1.000000,1263,892,NO,NO,NO,8,0.557175,0.693587
3,1698004,,,,,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.870000,4672,3306,YES,NO,YES,0,0.870538,0.405822
4,1698005,NO,NO,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.087020,0.814240,1.112804,0.874318,...,0.0,0.870000,4400,3413,YES,NO,YES,2,0.224729,0.870909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1699996,NO,NO,Z6vucL/W0MPoFsgu2ewNXrvNCAQFiKzUJTYuqh6lP28=,yhI9Bw5Q8l1vEll4sw/Tem/jojpE9KwjKvQQIyrAqgU=,1.294118,0.000000,1.000000,0.000000,0.164141,...,0.0,1.000000,1188,918,YES,NO,YES,2,0.198257,0.155724
1996,1699997,NO,NO,LKQ9Uh6tQ3ZrIxAKaPaDEuiYFunnK/2d+oKAfpN9tuY=,h0cPLYjd7nmw9FJsQA+KUsnChH0SajbHjNdfMk47k9o=,1.020217,0.583944,0.625842,1.003516,0.791136,...,0.0,0.720000,4400,3413,YES,NO,YES,0,0.582479,0.778864
1997,1699998,NO,NO,/tuZYGMsFx4A/Ou+jSol6t/TpLRkSl8Ku+1tnQPvwww=,aLEeZ8ZFKt2jQfkG5e9Nmad+QJlfpPmSfQS3CHlL6Ik=,0.354706,0.550882,0.930882,0.207941,0.207500,...,0.0,0.845000,4400,3400,NO,NO,NO,5,0.930588,0.201591
1998,1699999,NO,NO,uMIU2KDOxlgzhYToCFCa3nMxIOPV0WqCnKWfooGaw+8=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,1.220588,0.102059,0.326176,1.213824,0.942955,...,1.5,0.676667,4400,3400,NO,NO,NO,2,0.256471,0.938182


In [None]:
# Count the missing values in every column of the train and test frames
train_missing_counts = train_df.isna().sum()
test_missing_counts = test_df.isna().sum()

print('train_df', train_missing_counts)
print("test_df", test_missing_counts)

train_df id         0
x1      1426
x2      1426
x3      1426
x4      1426
        ... 
x141       0
x142       0
x143       0
x144       0
x145       0
Length: 146, dtype: int64
test_df id        0
x1      294
x2      294
x3      294
x4      294
       ... 
x141      0
x142      0
x143      0
x144      0
x145      0
Length: 146, dtype: int64


In [None]:
# Identify numerical and categorical columns

# Keep the test ids for the submission file, then remove 'id' from the features.
# The column is addressed by name rather than by position: re-running this cell
# now fails loudly with a KeyError instead of silently stripping a real feature
# column and storing 'x1' as the id.
firstcolumn_test_df = test_df[['id']].copy()
train_df = train_df.drop(columns=['id'])
test_df = test_df.drop(columns=['id'])

numerical_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train_df.select_dtypes(include=['object']).columns

# Identify hash features.
# A text column counts as boolean only when every non-missing training value is
# 'YES' or 'NO'; every other text column is treated as a hash feature. Checking
# for values outside that set (instead of requiring both 'YES' and 'NO' to be
# present) keeps a YES/NO column that happens to contain a single value out of
# the hash list, and keeps a hash column that merely contains both words in it.
BOOLEAN_VALUES = {'YES', 'NO'}
hash_features = []
for col in categorical_cols:
    observed_values = set(train_df[col].dropna().unique())
    if not observed_values <= BOOLEAN_VALUES:
        hash_features.append(col)

In [None]:
# Display the list of columns collected in hash_features
hash_features

['x3', 'x4', 'x34', 'x35', 'x61', 'x64', 'x65', 'x91', 'x94', 'x95']

In [None]:
# Bool columns: every categorical column that was not collected as a hash feature
bool_cols = [col for col in categorical_cols if col not in hash_features]

In [None]:
# Print the list of columns collected in bool_cols
print(bool_cols)

['x1', 'x2', 'x10', 'x11', 'x12', 'x13', 'x14', 'x24', 'x25', 'x26', 'x30', 'x31', 'x32', 'x33', 'x41', 'x42', 'x43', 'x44', 'x45', 'x55', 'x56', 'x57', 'x62', 'x63', 'x71', 'x72', 'x73', 'x74', 'x75', 'x85', 'x86', 'x87', 'x92', 'x93', 'x101', 'x102', 'x103', 'x104', 'x105', 'x115', 'x116', 'x117', 'x126', 'x127', 'x128', 'x129', 'x130', 'x140', 'x141', 'x142']


In [None]:
# Fill gaps in the hash and bool columns with each column's most frequent value
combined_features = hash_features + bool_cols

# The fill values are learned from the training rows only and then applied to
# both the train and the test frame
most_frequent_imputer = SimpleImputer(strategy='most_frequent')
most_frequent_imputer.fit(train_df[combined_features])

train_df[combined_features] = most_frequent_imputer.transform(train_df[combined_features])
test_df[combined_features] = most_frequent_imputer.transform(test_df[combined_features])

In [None]:
# Display test_df to inspect it after the most-frequent imputation of the hash and bool columns
test_df

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,NO,NO,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,1.414798,0.000000,1.000000,0.000000,0.202060,NO,...,0.0,1.000000,1262,892,NO,NO,NO,0,0.089686,0.193344
1,NO,NO,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,YES,...,1.0,0.866667,4672,3311,NO,NO,NO,5,0.945032,0.471318
2,NO,NO,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.000000,1.000000,0.000000,0.703088,NO,...,-1.0,1.000000,1263,892,NO,NO,NO,8,0.557175,0.693587
3,NO,NO,MZZbXga8gvaCBqWpzrh2iKdOkcsz/bG/z4BVjUnqWT0=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,0.000000,0.000000,0.000000,0.000000,0.000000,NO,...,0.0,0.870000,4672,3306,YES,NO,YES,0,0.870538,0.405822
4,NO,NO,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.087020,0.814240,1.112804,0.874318,NO,...,0.0,0.870000,4400,3413,YES,NO,YES,2,0.224729,0.870909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,NO,NO,Z6vucL/W0MPoFsgu2ewNXrvNCAQFiKzUJTYuqh6lP28=,yhI9Bw5Q8l1vEll4sw/Tem/jojpE9KwjKvQQIyrAqgU=,1.294118,0.000000,1.000000,0.000000,0.164141,YES,...,0.0,1.000000,1188,918,YES,NO,YES,2,0.198257,0.155724
1996,NO,NO,LKQ9Uh6tQ3ZrIxAKaPaDEuiYFunnK/2d+oKAfpN9tuY=,h0cPLYjd7nmw9FJsQA+KUsnChH0SajbHjNdfMk47k9o=,1.020217,0.583944,0.625842,1.003516,0.791136,YES,...,0.0,0.720000,4400,3413,YES,NO,YES,0,0.582479,0.778864
1997,NO,NO,/tuZYGMsFx4A/Ou+jSol6t/TpLRkSl8Ku+1tnQPvwww=,aLEeZ8ZFKt2jQfkG5e9Nmad+QJlfpPmSfQS3CHlL6Ik=,0.354706,0.550882,0.930882,0.207941,0.207500,NO,...,0.0,0.845000,4400,3400,NO,NO,NO,5,0.930588,0.201591
1998,NO,NO,uMIU2KDOxlgzhYToCFCa3nMxIOPV0WqCnKWfooGaw+8=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,1.220588,0.102059,0.326176,1.213824,0.942955,NO,...,1.5,0.676667,4400,3400,NO,NO,NO,2,0.256471,0.938182


In [None]:
# Label-encode the bool columns, keeping one fitted encoder per column
label_encoders = {col: LabelEncoder() for col in bool_cols}

for col, encoder in label_encoders.items():
    # Fit on the training values, then reuse the same mapping for the test values
    train_df[col] = encoder.fit_transform(train_df[col].astype(str))
    test_df[col] = encoder.transform(test_df[col].astype(str))

In [None]:
# Display test_df to inspect it after the bool columns were label-encoded
test_df

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,0,0,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,1.414798,0.000000,1.000000,0.000000,0.202060,0,...,0.0,1.000000,1262,892,0,0,0,0,0.089686,0.193344
1,0,0,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,1,...,1.0,0.866667,4672,3311,0,0,0,5,0.945032,0.471318
2,0,0,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.000000,1.000000,0.000000,0.703088,0,...,-1.0,1.000000,1263,892,0,0,0,8,0.557175,0.693587
3,0,0,MZZbXga8gvaCBqWpzrh2iKdOkcsz/bG/z4BVjUnqWT0=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,0.000000,0.000000,0.000000,0.000000,0.000000,0,...,0.0,0.870000,4672,3306,1,0,1,0,0.870538,0.405822
4,0,0,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.087020,0.814240,1.112804,0.874318,0,...,0.0,0.870000,4400,3413,1,0,1,2,0.224729,0.870909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,Z6vucL/W0MPoFsgu2ewNXrvNCAQFiKzUJTYuqh6lP28=,yhI9Bw5Q8l1vEll4sw/Tem/jojpE9KwjKvQQIyrAqgU=,1.294118,0.000000,1.000000,0.000000,0.164141,1,...,0.0,1.000000,1188,918,1,0,1,2,0.198257,0.155724
1996,0,0,LKQ9Uh6tQ3ZrIxAKaPaDEuiYFunnK/2d+oKAfpN9tuY=,h0cPLYjd7nmw9FJsQA+KUsnChH0SajbHjNdfMk47k9o=,1.020217,0.583944,0.625842,1.003516,0.791136,1,...,0.0,0.720000,4400,3413,1,0,1,0,0.582479,0.778864
1997,0,0,/tuZYGMsFx4A/Ou+jSol6t/TpLRkSl8Ku+1tnQPvwww=,aLEeZ8ZFKt2jQfkG5e9Nmad+QJlfpPmSfQS3CHlL6Ik=,0.354706,0.550882,0.930882,0.207941,0.207500,0,...,0.0,0.845000,4400,3400,0,0,0,5,0.930588,0.201591
1998,0,0,uMIU2KDOxlgzhYToCFCa3nMxIOPV0WqCnKWfooGaw+8=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,1.220588,0.102059,0.326176,1.213824,0.942955,0,...,1.5,0.676667,4400,3400,0,0,0,2,0.256471,0.938182


In [None]:
# Fill missing numerical values with a KNN imputer (5 neighbours)
knn_imputer = KNNImputer(n_neighbors=5)

# Fit on the training rows only; both frames are then transformed with that fit
knn_imputer.fit(train_df[numerical_cols])
train_df[numerical_cols] = knn_imputer.transform(train_df[numerical_cols])
test_df[numerical_cols] = knn_imputer.transform(test_df[numerical_cols])

In [None]:
# Display test_df to inspect it after KNN imputation of the numerical columns
test_df

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,0,0,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,1.414798,0.000000,1.000000,0.000000,0.202060,0,...,0.0,1.000000,1262.0,892.0,0,0,0,0.0,0.089686,0.193344
1,0,0,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,1,...,1.0,0.866667,4672.0,3311.0,0,0,0,5.0,0.945032,0.471318
2,0,0,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.000000,1.000000,0.000000,0.703088,0,...,-1.0,1.000000,1263.0,892.0,0,0,0,8.0,0.557175,0.693587
3,0,0,MZZbXga8gvaCBqWpzrh2iKdOkcsz/bG/z4BVjUnqWT0=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,0.000000,0.000000,0.000000,0.000000,0.000000,0,...,0.0,0.870000,4672.0,3306.0,1,0,1,0.0,0.870538,0.405822
4,0,0,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.087020,0.814240,1.112804,0.874318,0,...,0.0,0.870000,4400.0,3413.0,1,0,1,2.0,0.224729,0.870909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,Z6vucL/W0MPoFsgu2ewNXrvNCAQFiKzUJTYuqh6lP28=,yhI9Bw5Q8l1vEll4sw/Tem/jojpE9KwjKvQQIyrAqgU=,1.294118,0.000000,1.000000,0.000000,0.164141,1,...,0.0,1.000000,1188.0,918.0,1,0,1,2.0,0.198257,0.155724
1996,0,0,LKQ9Uh6tQ3ZrIxAKaPaDEuiYFunnK/2d+oKAfpN9tuY=,h0cPLYjd7nmw9FJsQA+KUsnChH0SajbHjNdfMk47k9o=,1.020217,0.583944,0.625842,1.003516,0.791136,1,...,0.0,0.720000,4400.0,3413.0,1,0,1,0.0,0.582479,0.778864
1997,0,0,/tuZYGMsFx4A/Ou+jSol6t/TpLRkSl8Ku+1tnQPvwww=,aLEeZ8ZFKt2jQfkG5e9Nmad+QJlfpPmSfQS3CHlL6Ik=,0.354706,0.550882,0.930882,0.207941,0.207500,0,...,0.0,0.845000,4400.0,3400.0,0,0,0,5.0,0.930588,0.201591
1998,0,0,uMIU2KDOxlgzhYToCFCa3nMxIOPV0WqCnKWfooGaw+8=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,1.220588,0.102059,0.326176,1.213824,0.942955,0,...,1.5,0.676667,4400.0,3400.0,0,0,0,2.0,0.256471,0.938182


In [None]:
# Standardize the numerical columns; the scaling statistics come from the training rows
scaler = StandardScaler()
scaler.fit(train_df[numerical_cols])

train_df[numerical_cols] = scaler.transform(train_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

In [None]:
# Display test_df to inspect it after the numerical columns were standardized
test_df

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,0,0,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,0.857439,-0.417309,0.589880,-0.524932,-0.818647,0,...,-0.037165,0.634811,-0.886124,-0.902211,0,0,0,-0.333004,-1.586709,-1.216260
1,0,0,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,-0.252074,-0.029485,0.436806,0.439880,0.112897,1,...,0.620360,-0.490459,1.261040,0.842482,0,0,0,0.120956,1.571035,-0.131738
2,0,0,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,0.859576,-0.417309,0.589880,-0.524932,0.848480,0,...,-0.694690,0.634811,-0.885495,-0.902211,0,0,0,0.393333,0.139155,0.735445
3,0,0,MZZbXga8gvaCBqWpzrh2iKdOkcsz/bG/z4BVjUnqWT0=,hCXwO/JldK5zcd9ejOD1FwmEgCf96eTdEVy7OtY2Y2g=,-1.839152,-0.417309,-2.241569,-0.524932,-1.490985,0,...,-0.037165,-0.462328,1.261040,0.838876,1,0,1,-0.333004,1.296023,-0.387274
4,0,0,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,0.313115,0.259911,0.063909,2.857412,1.418234,0,...,-0.037165,-0.462328,1.089771,0.916049,1,0,1,-0.151420,-1.088161,1.427271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,Z6vucL/W0MPoFsgu2ewNXrvNCAQFiKzUJTYuqh6lP28=,yhI9Bw5Q8l1vEll4sw/Tem/jojpE9KwjKvQQIyrAqgU=,0.627424,-0.417309,0.589880,-0.524932,-0.944818,1,...,-0.037165,0.634811,-0.932720,-0.883459,1,0,1,-0.151420,-1.185889,-1.363035
1996,0,0,LKQ9Uh6tQ3ZrIxAKaPaDEuiYFunnK/2d+oKAfpN9tuY=,h0cPLYjd7nmw9FJsQA+KUsnChH0SajbHjNdfMk47k9o=,0.105371,4.127135,-0.469528,2.525233,1.141454,1,...,-0.037165,-1.728257,1.089771,0.916049,1,0,1,-0.333004,0.232571,1.068154
1997,0,0,/tuZYGMsFx4A/Ou+jSol6t/TpLRkSl8Ku+1tnQPvwww=,aLEeZ8ZFKt2jQfkG5e9Nmad+QJlfpPmSfQS3CHlL6Ik=,-1.163086,3.869840,0.394177,0.107101,-0.800546,0,...,-0.037165,-0.673316,1.089771,0.906673,0,0,0,0.120956,1.517713,-1.184084
1998,0,0,uMIU2KDOxlgzhYToCFCa3nMxIOPV0WqCnKWfooGaw+8=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,0.487277,0.376947,-1.318017,3.164459,1.646616,0,...,0.949122,-2.093970,1.089771,0.906673,0,0,0,-0.151420,-0.970978,1.689737


In [None]:
# Combine hash features into a single string for vectorization:
# the hash values of one row are joined with spaces into one "document"
train_hash = train_df[hash_features].astype(str).apply(' '.join, axis=1)
test_hash = test_df[hash_features].astype(str).apply(' '.join, axis=1)

# Vectorize hash features using TF-IDF.
# Each hash must stay one opaque token. The vectorizer defaults would lowercase
# the text and split it on non-word characters, so a base64-style hash that
# contains '/', '+' or '=' would be broken into fragments and hashes differing
# only in letter case would be merged. Tokens are therefore defined as runs of
# non-whitespace characters and the original case is kept.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\S+', lowercase=False)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_hash)
X_test_tfidf = tfidf_vectorizer.transform(test_hash)

In [None]:
# Remove the hash columns from both frames; they are represented by the TF-IDF matrices
train_df = train_df.drop(columns=hash_features)
test_df = test_df.drop(columns=hash_features)

# Turn the remaining columns into CSR sparse matrices
train_sparse = sp.csr_matrix(train_df.to_numpy())
test_sparse = sp.csr_matrix(test_df.to_numpy())

# Place the TF-IDF columns to the right of the remaining feature columns
X_train = sp.hstack([train_sparse, X_train_tfidf])
X_test = sp.hstack([test_sparse, X_test_tfidf])

In [None]:
# Print a dense view of both feature matrices, training first and test second.
# NOTE(review): toarray() builds the full dense matrix before printing.
for feature_matrix in (X_train, X_test):
    print(feature_matrix.toarray())

[[ 0.          0.         -0.74023202 ...  0.          0.
   0.        ]
 [ 0.          0.         -1.83915188 ...  0.          0.
   0.        ]
 [ 0.          0.          0.71831091 ...  0.          0.
   0.        ]
 ...
 [ 1.          1.          0.85530271 ...  0.          0.
   0.        ]
 [ 0.          0.          0.62742352 ...  0.          0.
   0.        ]
 [ 1.          1.         -0.57948792 ...  0.          0.
   0.        ]]
[[ 0.          0.          0.85743947 ...  0.          0.
   0.        ]
 [ 0.          0.         -0.25207406 ...  0.          0.
   0.        ]
 [ 0.          0.          0.85957623 ...  0.          0.
   0.        ]
 ...
 [ 0.          0.         -1.16308599 ...  0.          0.
   0.        ]
 [ 0.          0.          0.48727719 ...  0.          0.
   0.        ]
 [ 0.          0.         -0.50605389 ...  0.          0.
   0.        ]]


In [None]:
# Keep one label row per training row. The row count comes from the feature
# matrix instead of a hard-coded number, so the labels stay in step with
# whatever train.csv was loaded.
# NOTE(review): rows are matched by position, not by id — this assumes
# trainLabels.csv lists its rows in the same order as train.csv; confirm.
new_train_labels_df = train_labels_df.iloc[:X_train.shape[0]]

# Everything after the first column is used as the targets
Y_train = new_train_labels_df.iloc[:, 1:]

In [None]:
# Base learner: a random forest of 100 trees with a fixed random seed
base_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Wrap the base learner for multi-output classification; n_jobs=-1 requests all available processors
multi_target_classifier = MultiOutputClassifier(estimator=base_classifier, n_jobs=-1)

# Fit on the combined feature matrix and the label columns
multi_target_classifier.fit(X_train, Y_train)

In [None]:
# Predict all labels for the test rows
test_pred = multi_target_classifier.predict(X_test)

# Prepare submission file: one prediction column per label, named after the
# label columns of the training labels (everything after their first column)
label_names = new_train_labels_df.columns[1:]
output_df = pd.DataFrame(test_pred, columns=label_names)

# Put the saved first test column to the left of the predictions
submission_df = pd.concat([firstcolumn_test_df, output_df], axis=1)
submission_df

Unnamed: 0,id,y1,y2,y3,y4,y5,y6,y7,y8,y9,...,y24,y25,y26,y27,y28,y29,y30,y31,y32,y33
0,1698001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1698002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1698003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1698004,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1698005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1699996,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,1699997,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1997,1699998,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1998,1699999,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Convert the submission_df into desired output (idvalue_testlabelname, test_pred)

# Extract the ID values (not used further in this cell)
ids = submission_df['id'].values

# Melt the wide frame into long format: one row per (id, label) pair
melted_df = submission_df.melt(id_vars=['id'], var_name='label', value_name='pred')

# Extract the numeric part of the label so rows sort numerically (y2 before y10).
# The pattern is a raw string: '\d' in a plain literal is an invalid escape
# sequence that newer Python versions warn about.
melted_df['label_number'] = melted_df['label'].str.extract(r'(\d+)', expand=False).astype(int)

# Sort the DataFrame by 'id' and 'label_number'
melted_df = melted_df.sort_values(by=['id', 'label_number'])

# Combine the ID values with label names
melted_df['id_label'] = melted_df['id'].astype(str) + '_' + melted_df['label']

# Select the final columns and renumber the rows from 0. Chaining returns a new
# frame instead of modifying a column selection of melted_df in place.
final_submission_df = melted_df[['id_label', 'pred']].reset_index(drop=True)

# Display the final DataFrame
final_submission_df

Unnamed: 0,id_label,pred
0,1698001_y1,0
1,1698001_y2,0
2,1698001_y3,0
3,1698001_y4,0
4,1698001_y5,0
...,...,...
65995,1700000_y29,0
65996,1700000_y30,0
65997,1700000_y31,0
65998,1700000_y32,0


In [None]:
# Write the final submission frame to a CSV file, leaving out the index column
submission_path = 'Karthik_submissionfile.csv'
final_submission_df.to_csv(submission_path, index=False)