<a href="https://colab.research.google.com/github/Gowtham6699/TextProphet-Multilabel-Text-Classification-with-Ensemble-Techniques/blob/main/Untitled26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the datasets
train_features = pd.read_csv('/content/train.csv')
train_labels = pd.read_csv('/content/trainLabels.csv')
test_features = pd.read_csv('/content/test.csv', header = None)
test_features.columns = train_features.columns

common_columns = train_features.columns.intersection(test_features.columns)
train_features = train_features[common_columns]
test_features = test_features[common_columns]

# Debug: Check if DataFrames are empty
print("Train features shape:", train_features.shape)
print("Test features shape:", test_features.shape)

# Debug: Check data types
print("Train features data types:\n", train_features.dtypes)
print("Test features data types:\n", test_features.dtypes)

# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
train_features_imputed = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_features_imputed = pd.DataFrame(imputer.transform(test_features), columns=test_features.columns)

# Convert boolean values to numerical
train_features_imputed = train_features_imputed.replace({'YES': 1, 'NO': 0})
test_features_imputed = test_features_imputed.replace({'YES': 1, 'NO': 0})

# Encode categorical features
from sklearn.preprocessing import LabelEncoder

for column in train_features_imputed.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    train_features_imputed[column] = le.fit_transform(train_features_imputed[column])

    # Transform test data, handling unseen labels
    test_values = test_features_imputed[column].astype(str).tolist()
    known_labels = set(le.classes_)
    transformed_values = [le.transform([x])[0] if x in known_labels else -1 for x in test_values]  # -1 for unseen labels
    test_features_imputed[column] = transformed_values

# Further steps for feature engineering and model training...

Train features shape: (9999, 146)
Test features shape: (2000, 146)
Train features data types:
 id        int64
x1       object
x2       object
x3       object
x4       object
         ...   
x141     object
x142     object
x143      int64
x144    float64
x145    float64
Length: 146, dtype: object
Test features data types:
 id        int64
x1       object
x2       object
x3       object
x4       object
         ...   
x141     object
x142     object
x143      int64
x144    float64
x145    float64
Length: 146, dtype: object


In [42]:
# Perform feature engineering using existing column names
if 'x1' in train_features_imputed.columns and 'x2' in train_features_imputed.columns:
    # Example 1: Interaction Terms
    train_features_imputed['interaction_feature'] = train_features_imputed['x1'] * train_features_imputed['x2']
    test_features_imputed['interaction_feature'] = test_features_imputed['x1'] * test_features_imputed['x2']

    # Example 2: Polynomial Features
    from sklearn.preprocessing import PolynomialFeatures

    poly = PolynomialFeatures(degree=2, include_bias=False)
    train_features_poly = poly.fit_transform(train_features_imputed)
    test_features_poly = poly.transform(test_features_imputed)

    # Example 3: Dimensionality Reduction (PCA)
    from sklearn.decomposition import PCA

    pca = PCA(n_components=10)
    train_features_pca = pca.fit_transform(train_features_imputed)
    test_features_pca = pca.transform(test_features_imputed)
else:
    print("One or more specified columns do not exist in the DataFrame.")


In [43]:
print("Number of samples in train_features_imputed:", train_features_imputed.shape[0])
print("Number of samples in train_labels:", train_labels.shape[0])


Number of samples in train_features_imputed: 9999
Number of samples in train_labels: 49999


In [44]:
# Adjust the number of samples in train_labels to match train_features_imputed
train_labels_matched = train_labels[:train_features_imputed.shape[0]]

# Now train_labels_matched and train_features_imputed should have the same number of samples
print("Number of samples in train_features_imputed:", train_features_imputed.shape[0])
print("Number of samples in train_labels_matched:", train_labels_matched.shape[0])

# Proceed with data splitting and model training using train_features_imputed and train_labels_matched


Number of samples in train_features_imputed: 9999
Number of samples in train_labels_matched: 9999


In [47]:
print(train_labels_matched[:5])


   id  y1  y2  y3  y4  y5  y6  y7  y8  y9  ...  y24  y25  y26  y27  y28  y29  \
0   1   0   0   0   0   0   0   0   0   0  ...    0    0    0    0    0    0   
1   2   0   0   0   0   0   0   0   0   0  ...    0    0    0    0    0    0   
2   3   0   0   1   0   0   0   0   0   0  ...    0    0    0    0    0    0   
3   4   0   0   0   0   0   0   0   0   0  ...    0    0    0    0    0    0   
4   5   0   0   0   0   0   0   0   0   0  ...    0    0    0    0    0    0   

   y30  y31  y32  y33  
0    0    0    0    1  
1    0    0    1    0  
2    0    0    0    0  
3    0    0    0    1  
4    0    0    0    1  

[5 rows x 34 columns]


In [49]:
from sklearn.preprocessing import MultiLabelBinarizer

# Assuming train_labels_matched is a list of lists or a similar format
mlb = MultiLabelBinarizer()
y_train_binary = mlb.fit_transform(train_labels_matched)

# Verify the shape
print(f"Shape of y_train_binary: {y_train_binary.shape}")  # Should be (9999, number_of_unique_labels)


Shape of y_train_binary: (34, 13)


In [51]:
train_labels_matched = train_labels[:train_features_imputed.shape[0]]
y_train_binary = mlb.fit_transform(train_labels_matched)

In [52]:
train_features_imputed = train_features_imputed[:y_train_binary.shape[0]]

In [53]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features_imputed, y_train_binary, test_size=0.2, random_state=42)

# Check the shapes of the split datasets
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of y_val: {y_val.shape}")


Shape of X_train: (27, 147)
Shape of y_train: (27, 13)
Shape of X_val: (7, 147)
Shape of y_val: (7, 13)


In [64]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define the model
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_model = MultiOutputClassifier(base_model, n_jobs=-1)

# Train the model
multi_target_model.fit(X_train, y_train)

# Evaluate the model on the validation set
predictions = multi_target_model.predict(X_val).reshape(-1, 13)

# Evaluate predictions
print(classification_report(y_val, predictions))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         3
           2       0.33      0.25      0.29         4
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00         7

   micro avg       0.73      0.40      0.52        20
   macro avg       0.10      0.10      0.10        20
weighted avg       0.42      0.40      0.41        20
 samples avg       0.83      0.40      0.51        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
print(classification_report(y_val, predictions, zero_division=1))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         0
           1       0.00      0.00      0.00         3
           2       0.33      0.25      0.29         4
           3       1.00      1.00      1.00         0
           4       1.00      0.00      0.00         1
           5       1.00      0.00      0.00         1
           6       1.00      0.00      0.00         1
           7       1.00      0.00      0.00         1
           8       1.00      0.00      0.00         1
           9       1.00      0.00      0.00         1
          10       1.00      1.00      1.00         0
          11       1.00      1.00      1.00         0
          12       1.00      1.00      1.00         7

   micro avg       0.73      0.40      0.52        20
   macro avg       0.87      0.40      0.41        20
weighted avg       0.72      0.40      0.41        20
 samples avg       0.83      0.40      0.51        20



In [66]:
import numpy as np

# Sum the occurrences of each label
label_counts = np.sum(y_train, axis=0)
print(f"Label counts: {label_counts}")


Label counts: [ 3 10  9  7  2  2  2  2  2  2  1  1 26]


In [67]:
import numpy as np

unique, counts = np.unique(y_train_binary, return_counts=True)
print(dict(zip(unique, counts)))


{0: 353, 1: 89}


In [68]:
from sklearn.metrics import classification_report

# Evaluate the model on the validation set
predictions = multi_target_model.predict(X_val)

# Evaluate predictions with zero_division set to handle undefined metrics
print(classification_report(y_val, predictions, zero_division=0))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         3
           2       0.33      0.25      0.29         4
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00         7

   micro avg       0.73      0.40      0.52        20
   macro avg       0.10      0.10      0.10        20
weighted avg       0.42      0.40      0.41        20
 samples avg       0.83      0.40      0.51        20



In [69]:
from sklearn.metrics import multilabel_confusion_matrix

# Generate the confusion matrix
cm = multilabel_confusion_matrix(y_val, predictions)
print(cm)


[[[7 0]
  [0 0]]

 [[3 1]
  [3 0]]

 [[1 2]
  [3 1]]

 [[7 0]
  [0 0]]

 [[6 0]
  [1 0]]

 [[6 0]
  [1 0]]

 [[6 0]
  [1 0]]

 [[6 0]
  [1 0]]

 [[6 0]
  [1 0]]

 [[6 0]
  [1 0]]

 [[7 0]
  [0 0]]

 [[7 0]
  [0 0]]

 [[0 0]
  [0 7]]]


In [75]:
import numpy as np
import pandas as pd

# Assuming multi_target_model is your trained model
# Assuming X_test_combined is your test feature set after preprocessing

# Use the trained model to predict probabilities for the test set
predictions = multi_target_model.predict_proba(X_val)

# Flatten the predictions for each sample into a single string
predictions_str = []
for i, prediction in enumerate(predictions):
    prediction_str = ','.join(map(str, prediction))
    predictions_str.append(prediction_str)

# Create a DataFrame with ID and predictions
submission_df = pd.DataFrame({'id': range(1, len(predictions_str) + 1),
                              'predictions': predictions_str})

# Save the predictions into a CSV file
submission_df.to_csv('submission.csv', index=False)
