In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

file_path = "../data_sets/dataset_raw.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,category,text
0,0,i didnt feel humiliated
1,0,i can go from feeling so hopeless to so damned...
2,3,im grabbing a minute to post i feel greedy wrong
3,2,i am ever feeling nostalgic about the fireplac...
4,3,i am feeling grouchy


In [2]:
X = data['text']
y = data['category']
# Vectorizing the data
vectorizer_try = CountVectorizer(stop_words='english', min_df=0.0003, ngram_range=(1, 4))
X_encoded = vectorizer_try.fit_transform(X)

# Printing data shape
print('Data shape: ', X_encoded.shape)


# Splitting the data and renaming variables
X_train, X_temp, y_train, y_temp = train_test_split(X_encoded, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Displaying the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)

Data shape:  (20000, 5009)
X_train shape: (12000, 5009)
X_val shape: (4000, 5009)
X_test shape: (4000, 5009)
y_train shape: (12000,)
y_val shape: (4000,)
y_test shape: (4000,)


**Random Forest Classifier**

In [3]:
def random_forest_classification(X_train, y_train, X_val, y_val, X_test, y_test):
    # Define the parameter grid for Random Forest
    param_grid = {
        'n_estimators': [100, 150],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5]
    }

    # Create Random Forest classifier
    classifier = RandomForestClassifier(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best parameters and model
    best_classifier = grid_search.best_estimator_
    print("Best Parameters:", grid_search.best_params_)

    # Validation accuracy
    val_predictions = best_classifier.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_predictions)
    print('\nValidation accuracy:', val_accuracy)

    # Validation F1 score
    val_f1 = f1_score(y_val, val_predictions, average='weighted')
    print('\nF1 Score (Validation):', val_f1)

    # Test accuracy
    test_predictions = best_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print('\nTesting accuracy:', test_accuracy)

    # Test F1 score
    test_f1 = f1_score(y_test, test_predictions, average='weighted')
    print('\nF1 Score (Test):', test_f1)

    return best_classifier

In [4]:
best_rf_classifier = random_forest_classification(X_train, y_train, X_val, y_val, X_test, y_test)

Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}

Validation accuracy: 0.88025

F1 Score (Validation): 0.8801039211409051

Testing accuracy: 0.901

F1 Score (Test): 0.9005558497846045


In [5]:
best_classifier = RandomForestClassifier(max_depth=None, n_estimators=150, min_samples_split=5, random_state=42)
best_classifier.fit(X_train, y_train)

In [6]:
val_predictions = best_classifier.predict(X_val)
val_accuracy = np.mean(val_predictions == y_val)
print('\nValidation accuracy:', format(100 * val_accuracy, '.4f'))

# Compute and print the confusion matrix for validation data
cm_val = confusion_matrix(y_val, val_predictions)
print('\nConfusion Matrix (Validation):')
print(cm_val)

# Compute and print F1 score on the validation data
val_f1 = f1_score(y_val, val_predictions, average='weighted')
print('\nF1 Score (Validation):', format(val_f1, '.4f'))

# Compute and print accuracy on the test data
test_predictions = best_classifier.predict(X_test)
test_accuracy = np.mean(test_predictions == y_test)
print('\nTesting accuracy:', format(100 * test_accuracy, '.4f'))

# Compute and print the confusion matrix for test data
cm_test = confusion_matrix(y_test, test_predictions)
print('\nConfusion Matrix (Test):')
print(cm_test)

# Compute and print F1 score on the test data
test_f1 = f1_score(y_test, test_predictions, average='weighted')
print('\nF1 Score (Test):', format(test_f1, '.4f'))


Validation accuracy: 88.0250

Confusion Matrix (Validation):
[[1076   31   10   22   37    4]
 [  42 1193   48   13   13    3]
 [   4   81  259    3    2    0]
 [  27   22    6  478   17    1]
 [  10   11    0   11  393   29]
 [   4    8    0    2   18  122]]

F1 Score (Validation): 0.8801

Testing accuracy: 90.1000

Confusion Matrix (Test):
[[1085   24    6   33   24    3]
 [  33 1281   36    9   11    3]
 [   3   56  258    3    4    1]
 [  27   20    0  486    9    1]
 [  11    8    2   15  398   14]
 [   1    9    1    2   27   96]]

F1 Score (Test): 0.9006


In [7]:
# Feature importance
feature_importances = best_classifier.feature_importances_

# Top N feature importances and their contribution
N = 10
top_n_indices = feature_importances.argsort()[-N:][::-1]
top_n_feature_names = [vectorizer_try.get_feature_names_out()[i] for i in top_n_indices]
top_n_feature_importances = feature_importances[top_n_indices]

top_n_feature_names, top_n_feature_importances
feature_importance_df = pd.DataFrame({
    'Feature': top_n_feature_names,
    'Importance': top_n_feature_importances
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

feature_importance_df

Unnamed: 0,Feature,Importance
0,shaken,0.005969
1,impressed,0.005344
2,terrified,0.005322
3,amazed,0.005249
4,apprehensive,0.00522
5,curious,0.005069
6,bothered,0.005019
7,weird,0.004919
8,violent,0.004895
9,reluctant,0.004799
