# Libraries

In [17]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, mean_squared_error, make_scorer,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

import nltk
from nltk.corpus import stopwords
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


# Read Dataset

In [18]:
raw_df = pd.read_csv('train-balanced-sarcasm.csv')
raw_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


# Data-preprocessing 

## Drop Features

Irrelevant: author, date, created_utc

In [22]:
# Drop NAs
raw_df.dropna(inplace=True)

# Select 100000 rows of sample
# Reset index so the cross validation later won't go wrong
#filter_df = raw_df.sample(n=100000, random_state=000).reset_index(drop=True)
filter_df = raw_df

# Drop irrelevant features
filter_df.drop(['author', 'date', 'created_utc'],axis=1)

# Data is balance, do not need oversampling
print(filter_df['label'].value_counts()) 

# Show the data
filter_df.head()

0    505405
1    505368
Name: label, dtype: int64


Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


## Categorical Process

Transform comments into TF-IDF vectors

In [23]:
# Instantiate the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit and transform the processed comments and parent_comment
tfidf_comment = tfidf_vectorizer.fit_transform(filter_df['comment'])
tfidf_parent_comment = tfidf_vectorizer.fit_transform(filter_df['parent_comment'])

# Display the shape of the resulting TF-IDF feature matrix
print(tfidf_comment.shape)
print(tfidf_parent_comment.shape)

(1010773, 2450010)
(1010773, 4926771)


We have got a comment TF-IDF matrix, containing 100000 rows and 436490 features
We have got a parent_comment TF-IDF matrix, containing 100000 rows and 899057 features

Transform categorical "subreddit" to dummy

In [5]:
categorical_columns = ['subreddit']
for i in categorical_columns: 
    filter_df = pd.concat([filter_df,pd.get_dummies(filter_df[i],drop_first=True, prefix=i)],axis=1)
    filter_df = filter_df.drop(i,axis=1)
    
filter_df.shape

(100000, 5594)

## Split Dataset

In [24]:
# Y is the response variable
Y = filter_df['label']

# X is the features
X = tfidf_comment

# Split the data (Train 0.8, Test 0.2)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=000)

## K-Fold CV Setup

In [25]:
# Set up K-Fold Cross Validation 
n_splits = 5
shuffle = True
random_state = 000
cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

## A Function to Create Dictionary

In [26]:
def create_dictionary(param_1,param_2):
    result_dictionary = {} 
    for i in param_1: 
        result_dictionary[i] = {} 
        for j in param_2: 
                result_dictionary[i][j] = {} 
    return result_dictionary

## Random Forest

Random Forest Cross Validation

In [29]:
# CV in Trees
# Set Hyperparameter (Lambda) values to cross validate
max_depth = [2, 5, 10, 15, 20, 25]
number_of_trees = [50, 100, 150, 200]

cross_validate_result = create_dictionary(number_of_trees, max_depth)
cross_validate_recall = create_dictionary(number_of_trees, max_depth)
cross_validate_precision = create_dictionary(number_of_trees, max_depth)
cross_validate_mse = create_dictionary(number_of_trees, max_depth)

for tree in number_of_trees:
    for depth in max_depth:
        print('Depth of Tree : ', depth, ' Number of Trees ', tree)

        accuracies = []
        recall_scores = []
        precision_scores = []
        mse_scores = []

        random_forest_cv = RandomForestClassifier(n_estimators=tree, max_depth=depth)

        for train_index, test_index in cv.split(X):
            # define the rows in the dataframe
            X_cv_train, X_cv_test = X[train_index], X[test_index]
            Y_cv_train, Y_cv_test = Y[train_index], Y[test_index]

            random_forest_cv.fit(X_cv_train, Y_cv_train)
            Y_pred = random_forest_cv.predict(X_cv_test)

            # Cross-Validation Prediction Error
            score = random_forest_cv.score(X_cv_test, Y_cv_test)
            accuracies.append(score)
            recall_scores.append(recall_score(Y_cv_test, Y_pred))
            precision_scores.append(precision_score(Y_cv_test, Y_pred))
            mse_scores.append(mean_squared_error(Y_cv_test, Y_pred))

        cross_validate_result[tree][depth] = (sum(accuracies) / len(accuracies))
        cross_validate_recall[tree][depth] = (sum(recall_scores) / len(recall_scores))
        cross_validate_precision[tree][depth] = (sum(precision_scores) / len(precision_scores))
        cross_validate_mse[tree][depth] = (sum(mse_scores) / len(mse_scores))

        print("Accuracy : " + str((sum(accuracies) / len(accuracies))))
        print("Precision : " + str((sum(recall_scores) / len(recall_scores))))
        print("Recall : " + str((sum(precision_scores) / len(precision_scores))))
        print("MSE : " + str((sum(mse_scores) / len(mse_scores))))
        print()

# Dictionary Summary
print('------------------')
print('Accuracy : ', cross_validate_result)
print('Precision : ', cross_validate_precision)
print('Recall : ', cross_validate_recall)
print('MSE : ', cross_validate_mse)

Depth of Tree :  2  Number of Trees  50


KeyboardInterrupt: 

Plot

In [None]:
plt.figure(figsize=(10, 8))

# Define line styles and colors for different number of trees
line_styles = {50: 'solid', 100: 'dashed', 150: 'dotted', 200: 'dashdot'}
colors = {'Accuracy': 'blue', 'Precision': 'green', 'Recall': 'red', 'MSE': 'grey'}

# Consolidate plotting data
metrics = {
    'Accuracy': cross_validate_result,
    'Precision': cross_validate_precision,
    'Recall': cross_validate_recall,
    'MSE': cross_validate_mse,
}

# Plotting
for metric_name, metric_data in metrics.items():
    for num_trees, depths in metric_data.items():
        plt.plot(max_depth, [depths[depth] for depth in max_depth],
                 label=f'{metric_name} ({num_trees} trees)',
                 linestyle=line_styles[num_trees],
                 color=colors[metric_name])

plt.title('Model Performance by Max Depth and Number of Trees')
plt.xlabel('Max Depth')
plt.ylabel('Metric Value')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

After selected the best parameter, run the random forest OOS

In [27]:
random_forest = RandomForestClassifier(n_estimators = 200, max_depth=10, random_state=0)
random_forest.fit(X, Y)

y_pred = random_forest.predict(X_test)
print("RND Forest Accuracy : " , accuracy_score(y_test, y_pred))
print("RND Forest Recall : " , recall_score(y_test, y_pred))
print("RND Forest Precision : ", precision_score(y_test,y_pred))
print("RND Forest F1 : ", f1_score(y_test,y_pred))
print("RND Forest MSE : ", mean_squared_error(y_test,y_pred))

RND Forest Accuracy :  0.6675916994385497
RND Forest Recall :  0.5343611985819816
RND Forest Precision :  0.7278723461652594
RND Forest F1 :  0.6162833192480757
RND Forest MSE :  0.3324083005614504


Extract the random forest coefficient to see its importance

In [28]:
idf_values = tfidf_vectorizer.idf_

# Get Feature Importance
importance = random_forest.feature_importances_

# Get Feature Name
feature_names = tfidf_vectorizer.get_feature_names_out()

# Pair Feature Importance and Name
features_importance = zip(feature_names, importance)

# Pair Feature IDF Value and Name
features_idf = zip(feature_names, idf_values)

# Sort feature by importance
sorted_features = sorted(features_importance, key=lambda x: x[1], reverse=True)

# Get first 50
for feature, importance in sorted_features[:50]:
    print(f"{feature}: {importance}")

later snowboard: 0.017216191437231947
cameras conveniently: 0.01491188864063314
hillary would: 0.011223054360433207
just freezes: 0.009869132529134393
camille deserves: 0.009790887421415524
judge what: 0.009211462689342129
happens won: 0.00898179141521509
amazeballs why: 0.008469001460810463
fucking giraffe: 0.008163975028692021
inspectors ad: 0.008129785720232178
fast method: 0.007830875326338296
hebrews just: 0.007592631274140501
565 rest: 0.006958159439414971
american dnr: 0.006457834724192914
city reputation: 0.0064499339152156955
around what: 0.006386466207715914
land damaging: 0.00533912693940992
kershaw shocking: 0.005027458234483149
fame each: 0.004960030623059647
gets squeed: 0.004859906694155918
later issued: 0.004799917116109559
at dncmedaparty: 0.004727503193279378
foundation tax: 0.004266122935963653
apart via: 0.00426259473309641
impossible super: 0.004138159023838988
exactly sex: 0.004124119785610934
first motorcycle: 0.0040412519522353765
later waiting: 0.00401148807531

Eliminate Stop Words

In [29]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Filter Stop Words
filtered_features = [(feature, importance) for feature, importance in sorted_features if feature not in stop_words]

# Print Filtered Features
for feature, importance in filtered_features[:100]: # Get First 100
    print(f"{feature}: {importance}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\16920\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


later snowboard: 0.017216191437231947
cameras conveniently: 0.01491188864063314
hillary would: 0.011223054360433207
just freezes: 0.009869132529134393
camille deserves: 0.009790887421415524
judge what: 0.009211462689342129
happens won: 0.00898179141521509
amazeballs why: 0.008469001460810463
fucking giraffe: 0.008163975028692021
inspectors ad: 0.008129785720232178
fast method: 0.007830875326338296
hebrews just: 0.007592631274140501
565 rest: 0.006958159439414971
american dnr: 0.006457834724192914
city reputation: 0.0064499339152156955
around what: 0.006386466207715914
land damaging: 0.00533912693940992
kershaw shocking: 0.005027458234483149
fame each: 0.004960030623059647
gets squeed: 0.004859906694155918
later issued: 0.004799917116109559
at dncmedaparty: 0.004727503193279378
foundation tax: 0.004266122935963653
apart via: 0.00426259473309641
impossible super: 0.004138159023838988
exactly sex: 0.004124119785610934
first motorcycle: 0.0040412519522353765
later waiting: 0.00401148807531

Visualization

In [None]:
# 分别提取特征名和重要性得分
features = [feature for feature, _ in filtered_features]
importances = [importance for _, importance in filtered_features]

# 绘图
plt.figure(figsize=(10, 8))
plt.barh(features, importances, color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()  # 反转y轴，使最重要的特征显示在顶部
plt.show()

#待跑

In [None]:
import seaborn as sns

plt.figure(figsize=(10, 8))
sns.barplot(x=importances, y=features, palette="viridis")

plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Top Features Importance from Random Forest (Filtered)')
plt.show()

#待跑

## Adaboost

Adaptive Boosting Algorithm Cross Validation

In [20]:
# CV in Trees 
# Set Hyperparameter (Lambda) values to cross validate

# Create a weak learner (a stump)
weak_learner = DecisionTreeClassifier(max_depth=1)

learning_rates = [0.1, 0.01, 0.001]
number_of_trees_ada = [50, 100, 150, 200]

cross_validate_result = create_dictionary(learning_rates,number_of_trees_ada)
cross_validate_recall = create_dictionary(learning_rates,number_of_trees_ada)
cross_validate_precision = create_dictionary(learning_rates,number_of_trees_ada) 
cross_validate_mse = create_dictionary(learning_rates,number_of_trees_ada)

for rate in learning_rates: 
    for tree in number_of_trees_ada:
        print('Depth of Tree : ', tree, ' Learning rate ', rate)
        accuracies = []
        recall_scores = [] 
        precision_scores = [] 
        mse_scores = []
        adaboost_classifier = AdaBoostClassifier(estimator=weak_learner, n_estimators=tree, random_state=0)
        for train_index, test_index in cv.split(X):
            # define the rows in the dataframe
            X_cv_train, X_cv_test = X[train_index], X[test_index]
            Y_cv_train, Y_cv_test = Y[train_index], Y[test_index]
            
            adaboost_classifier.fit(X_cv_train, Y_cv_train)
            Y_pred = adaboost_classifier.predict(X_cv_test)

            #Cross-Validation Prediction Error
            score = adaboost_classifier.score(X_cv_test, Y_cv_test)
            accuracies.append(score)
            recall_scores.append(recall_score(Y_cv_test, Y_pred))
            precision_scores.append(precision_score(Y_cv_test,Y_pred))
            mse_scores.append(mean_squared_error(Y_cv_test, Y_pred))
        
        cross_validate_result[rate][tree] = (sum(accuracies)/len(accuracies))
        cross_validate_recall[rate][tree] = (sum(recall_scores)/len(recall_scores))
        cross_validate_precision[rate][tree] = (sum(precision_scores)/len(precision_scores))
        cross_validate_mse[rate][tree] = (sum(mse_scores)/len(mse_scores))

        print("Accuracy : " + str((sum(accuracies)/len(accuracies))))
        print("Precision : " + str((sum(recall_scores)/len(recall_scores))))
        print("Recall : " + str((sum(precision_scores)/len(precision_scores))))
        print("MSE : " + str((sum(mse_scores)/len(mse_scores))))
        print() 

# Dictionary Summary
print('------------------')
print('Accuracy : ', cross_validate_result)
print('Precision : ', cross_validate_precision)
print('Recall : ', cross_validate_recall)
print('MSE : ', cross_validate_mse)

#待跑

Depth of Tree :  50  Learning rate  0.1


KeyboardInterrupt: 

Plot

In [None]:
plt.figure(figsize=(10, 8))

# Define line styles and colors for different number of trees
line_styles = {50: 'solid', 100: 'dashed', 150: 'dotted', 200: 'dashdot'}
colors = {'Accuracy': 'blue', 'Precision': 'green', 'Recall': 'red', 'MSE': 'grey'}

# Consolidate plotting data
metrics = {
    'Accuracy': cross_validate_result,
    'Precision': cross_validate_precision,
    'Recall': cross_validate_recall,
    'MSE': cross_validate_mse,
}

# Plotting
for metric_name, metric_data in metrics.items():
    for num_trees, rates in metric_data.items():
        plt.plot(number_of_trees_ada, [rates[rate] for rate in rates],
                 label=f'{metric_name} ({num_trees} trees)',
                 linestyle=line_styles[num_trees],
                 color=colors[metric_name])

plt.title('Model Performance by Learning Rate and Number of Trees')
plt.xlabel('Learning Rate')
plt.ylabel('Metric Value')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

Adaboost Rerun

In [21]:
# Create a weak learner (a stump)
weak_learner = DecisionTreeClassifier(max_depth=1)

# Create an AdaBoost classifier using the weak learner
adaboost_classifier = AdaBoostClassifier(estimator=weak_learner, n_estimators=1000, random_state=0)

# Train the AdaBoost classifier
adaboost_classifier.fit(X, Y)

y_pred = adaboost_classifier.predict(X_test)
print("Adaboost Accuracy : " , accuracy_score(Y_test, Y_pred))
print("Adaboost Recall : " , recall_score(y_OOS_test, y_pred))
print("Adaboost Precision : ", precision_score(y_OOS_test,y_pred))
print("Adaboost f1 : ", f1_score(y_OOS_test,y_pred))
print("Adaboost MSE : ", mean_squared_error(y_OOS_test,y_pred))

NameError: name 'X_smote' is not defined