In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:

# Load dataset
df = pd.read_csv('train_data.csv')

# Data Cleaning
def clean_text(text):
    # Implement text cleaning
    return text
df['text']=df.iloc[:,0]
df['label']=df.iloc[:,1]
df['cleaned_text'] = df['text'].apply(clean_text)


In [3]:

# Feature Engineering
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:

# Model Training and Tuning
def train_model(model, param_grid):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_



In [5]:

# Decision Tree
dt_param_grid = {'max_depth': [10, 20, 30]}
dt_model = train_model(DecisionTreeClassifier(), dt_param_grid)



In [6]:

# Random Forest
rf_param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
rf_model = train_model(RandomForestClassifier(), rf_param_grid)


In [7]:

# Gradient Boosting
gb_param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.1, 0.01]}
gb_model = train_model(GradientBoostingClassifier(), gb_param_grid)


In [11]:
# Model Evaluation
models = {'Decision Tree': dt_model, 'Random Forest': rf_model, 'Gradient Boosting': gb_model}
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

# Cross-Validation
skf = StratifiedKFold(n_splits=5)
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
    print(f"{name} Cross-Validation Accuracy: {scores.mean()}")


Decision Tree Accuracy: 0.5248730964467005
              precision    recall  f1-score   support

       ANGRY       0.55      0.24      0.34       185
        FEAR       0.56      0.36      0.44        66
       HAPPY       0.89      0.62      0.73       306
       OTHER       0.39      0.82      0.53       267
         SAD       0.47      0.25      0.32       161

    accuracy                           0.52       985
   macro avg       0.57      0.46      0.47       985
weighted avg       0.60      0.52      0.52       985

[[ 45   5  10 112  13]
 [  3  24   1  34   4]
 [  9   3 190  98   6]
 [ 11   8   7 218  23]
 [ 14   3   6  98  40]]
Random Forest Accuracy: 0.5208121827411167
              precision    recall  f1-score   support

       ANGRY       0.73      0.19      0.30       185
        FEAR       0.00      0.00      0.00        66
       HAPPY       0.70      0.75      0.73       306
       OTHER       0.40      0.88      0.55       267
         SAD       0.74      0.09     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Gradient Boosting Accuracy: 0.6091370558375635
              precision    recall  f1-score   support

       ANGRY       0.59      0.43      0.50       185
        FEAR       0.64      0.55      0.59        66
       HAPPY       0.87      0.74      0.80       306
       OTHER       0.48      0.69      0.57       267
         SAD       0.49      0.46      0.48       161

    accuracy                           0.61       985
   macro avg       0.62      0.57      0.59       985
weighted avg       0.63      0.61      0.61       985

[[ 79   6  12  71  17]
 [  2  36   1  20   7]
 [ 13   1 226  50  16]
 [ 25   9  12 185  36]
 [ 15   4   9  59  74]]
Decision Tree Cross-Validation Accuracy: 0.49867731418430933
Random Forest Cross-Validation Accuracy: 0.5074126119433784
Gradient Boosting Cross-Validation Accuracy: 0.5833888407411993


In [12]:

# Final Inference
# Load test set and make predictions
test_df = pd.read_csv('test_data.csv')

test_df['text']=test_df.iloc[:,0]
test_df['label']=test_df.iloc[:,1]
test_df['cleaned_text'] = test_df['text'].apply(clean_text)
X_test_final = vectorizer.transform(test_df['cleaned_text'])
final_predictions = rf_model.predict(X_test_final)

# Save predictions
pd.DataFrame({'text': test_df['text'], 'predicted_label': final_predictions}).to_csv('final_predictions.csv', index=False)
