# Libraries

In [58]:
import pandas as pd

from sklearn.model_selection import KFold

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer





import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, mean_squared_error, make_scorer,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.ensemble import AdaBoostClassifier

from xgboost import XGBClassifier  # Assuming you are using XGBoost for classification; use XGBRegressor for regression
import xgboost as xgb 

from sklearn.metrics import confusion_matrix

import pprint
from tabulate import tabulate

from sklearn import linear_model

from imblearn.over_sampling import SMOTE


# Read Dataset

In [59]:
raw_df = pd.read_csv('train-balanced-sarcasm.csv')
raw_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


# Data-preprocessing 

## Drop Features

Irrelevant: author, date, created_utc

In [60]:
# Drop NAs
raw_df.dropna(inplace=True)

# Select 100000 rows of sample
# Reset index so the cross validation later won't go wrong
filter_df = raw_df.sample(n=100000, random_state=000).reset_index(drop=True)

# Drop irrelevant features
filter_df.drop(['author', 'date', 'created_utc'],axis=1)

# Data is balance, do not need oversampling
print(filter_df['label'].value_counts()) 

# Show the data
filter_df.head()

0    50016
1    49984
Name: label, dtype: int64


Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,"The title of this article should be: ""How to n...",xNOM,MensRights,1,1,0,2014-11,2014-11-07 11:48:34,You can approach women without being creepy
1,0,What a wasted opportunity... at least be funny...,Jacked1218,MMA,5,5,0,2016-07,2016-07-05 22:06:19,Nate Diaz Snapchat Hacked
2,1,But....but... sodium!,Chicup,fatlogic,1,1,0,2015-03,2015-03-09 13:16:40,Canned soups have been hugely helpful for the ...
3,1,"Yeah, we need more animosity between nations.",Bloodysneeze,worldnews,1,1,0,2013-07,2013-07-09 15:20:36,It really sounds like all the English speaking...
4,0,uuugh,name032282,Minecraft,2,2,0,2011-04,2011-04-02 04:50:42,Has anyone held a doggy funeral on their serve...


## Categorical Process

Transform comments into TF-IDF vectors

In [61]:
# Instantiate the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit and transform the processed comments and parent_comment
tfidf_comment = tfidf_vectorizer.fit_transform(filter_df['comment'])
tfidf_parent_comment = tfidf_vectorizer.fit_transform(filter_df['parent_comment'])

# Display the shape of the resulting TF-IDF feature matrix
print(tfidf_comment.shape)
print(tfidf_parent_comment.shape)

(100000, 436490)
(100000, 899057)


We have got a comment TF-IDF matrix, containing 100000 rows and 436490 features
We have got a parent_comment TF-IDF matrix, containing 100000 rows and 899057 features

Transform categorical "subreddit" to dummy

In [62]:
categorical_columns = ['subreddit']
for i in categorical_columns: 
    filter_df = pd.concat([filter_df,pd.get_dummies(filter_df[i],drop_first=True, prefix=i)],axis=1)
    filter_df = filter_df.drop(i,axis=1)
    
filter_df.shape

(100000, 5594)

## Split Dataset

In [63]:
# Y is the response variable
Y = filter_df['label']

# X is the features
X = tfidf_comment

# Split the data (Train 0.8, Test 0.2)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=000)

## K-Fold CV Setup

In [64]:
# Set up K-Fold Cross Validation 
n_splits = 5
shuffle = True
random_state = 000
cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

## A Function to Create Dictionary

In [65]:
def create_dictionary(param_1,param_2):
    result_dictionary = {} 
    for i in param_1: 
        result_dictionary[i] = {} 
        for j in param_2: 
                result_dictionary[i][j] = {} 
    return result_dictionary

## Random Forest

Random Forest Cross Validation

In [67]:
# CV in Trees  
# Set Hyperparameter (Lambda) values to cross validate here !!!!! 
max_depth = [2,5,10,15,20,25]
number_of_trees = [51,101,151,201] 

cross_validate_result = create_dictionary(number_of_trees,max_depth)
cross_validate_recall = create_dictionary(number_of_trees,max_depth)
cross_validate_precision = create_dictionary(number_of_trees,max_depth)
cross_validate_mse = create_dictionary(number_of_trees, max_depth)

with open('Random Forest CV Summary.txt', 'w') as file:
    
    for tree in number_of_trees: 
        for depth in max_depth:
            print('Depth of Tree : ', depth, ' Number of Trees ', tree)
            file.write(f'Depth of Tree : {depth}, Number of Trees: {tree}\n')
            
            accuracies = []
            recall_scores = [] 
            precision_scores = [] 
            mse_scores = []

            random_forest_cv = RandomForestClassifier(n_estimators = tree, max_depth=depth)
        
            for train_index, test_index in cv.split(X):
                # change to loc to define the rows in the dataframe 
                X_cv_train, X_cv_test = X[train_index], X[test_index]
                Y_cv_train, Y_cv_test = Y[train_index], Y[test_index]
            
                random_forest_cv.fit(X_cv_train, Y_cv_train)
                Y_pred = random_forest_cv.predict(X_cv_test)

                #Cross-Validation Prediction Error
                score = random_forest_cv.score(X_cv_test, Y_cv_test)
                accuracies.append(score)
                recall_scores.append(recall_score(Y_cv_test, Y_pred))
                precision_scores.append(precision_score(Y_cv_test,Y_pred))
                mse_scores.append(mean_squared_error(Y_cv_test, Y_pred))
        
            cross_validate_result[tree][depth] = (sum(accuracies)/len(accuracies))
            cross_validate_recall[tree][depth] = (sum(recall_scores)/len(recall_scores))
            cross_validate_precision[tree][depth] = (sum(precision_scores)/len(precision_scores))
            cross_validate_mse[tree][depth] = (sum(mse_scores) / len(mse_scores))
            
            file.write(f"Accuracy: {np.mean(accuracies)}\n")
            file.write(f"Precision: {np.mean(precision_scores)}\n")
            file.write(f"Recall: {np.mean(recall_scores)}\n")
            file.write(f"MSE: {np.mean(mse_scores)}\n\n")

        print("Accuracy : " + str((sum(accuracies)/len(accuracies))))
        print("Precision : " + str((sum(recall_scores)/len(recall_scores))))
        print("Recall : " + str((sum(precision_scores)/len(precision_scores))))
        print("MSE : " + str((sum(mse_scores) / len(mse_scores))))
        print() 
        
    # Dictionary Summary
    print('------------------')
    print('Accuracy : ', cross_validate_result)
    print('Precision : ',cross_validate_precision)
    print('Recall : ',cross_validate_recall)
    print('MSE : ', cross_validate_mse)
    
    file.write('------------------\n')
    file.write('Accuracy Summary:\n')
    file.write(str(cross_validate_result) + '\n')
    file.write('Precision Summary:\n')
    file.write(str(cross_validate_precision) + '\n')
    file.write('Recall Summary:\n')
    file.write(str(cross_validate_recall) + '\n')
    file.write('MSE Summary:\n')
    file.write(str(cross_validate_mse) + '\n')

Depth of Tree :  5  Number of Trees  50
Accuracy : 0.59803
Precision : 0.3825504935784383
Recall : 0.6773511191699465

Depth of Tree :  10  Number of Trees  50
Accuracy : 0.62404
Precision : 0.467190124517038
Recall : 0.6825602446111234

Depth of Tree :  20  Number of Trees  50
Accuracy : 0.63994
Precision : 0.5092531345465913
Recall : 0.6897218127610609

Depth of Tree :  5  Number of Trees  100
Accuracy : 0.62166
Precision : 0.4726726076503482
Recall : 0.6765865582782878

Depth of Tree :  10  Number of Trees  100
Accuracy : 0.64376
Precision : 0.5075998038122622
Recall : 0.7003556176931325

Depth of Tree :  20  Number of Trees  100
Accuracy : 0.6550100000000001
Precision : 0.5329496851245575
Recall : 0.7054703721406496

Depth of Tree :  5  Number of Trees  150
Accuracy : 0.6336600000000001
Precision : 0.49299248489183994
Recall : 0.6901094132379519

Depth of Tree :  10  Number of Trees  150
Accuracy : 0.65209
Precision : 0.5186244109213808
Recall : 0.7099828440343202

Depth of Tree : 

Plot

In [80]:
# Plotting all metrics in a single plot with different line styles and colors to distinguish them

plt.figure(figsize=(10, 8))

# Define line styles and colors for different number of trees
line_styles = {50: 'solid', 100: 'dashed', 150: 'dotted'}
colors = {'Accuracy': 'blue', 'Precision': 'green', 'Recall': 'red', 'MSE': 'purple'}

# Consolidate plotting data
metrics = {
    'Accuracy': cross_validate_result,
    'Precision': cross_validate_precision,
    'Recall': cross_validate_recall,
    #'MSE': cross_validate_mse,
}

# Plotting
for metric_name, metric_data in metrics.items():
    for num_trees, depths in metric_data.items():
        plt.plot(max_depth, [depths[depth] for depth in max_depth], 
                 label=f'{metric_name} ({num_trees} trees)', 
                 linestyle=line_styles[num_trees], 
                 color=colors[metric_name])

plt.title('Model Performance by Max Depth and Number of Trees')
plt.xlabel('Max Depth')
plt.ylabel('Metric Value')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

KeyError: 4

<Figure size 1000x800 with 0 Axes>

After selected the best parameter, rerun the random forest

In [None]:
random_forest = RandomForestClassifier(n_estimators = 200, max_depth=5, random_state=0)
random_forest.fit(X, Y)

y_pred = random_forest.predict(X_test)
print("RND Forest Accuracy : " , accuracy_score(y_test, y_pred))
print("RND Forest Recall : " , recall_score(y_test, y_pred))
print("RND Forest Precision : ", precision_score(y_test,y_pred))
print("RND Forest F1 : ", f1_score(y_test,y_pred))
print("RND Forest MSE : ", mean_squared_error(y_test,y_pred))