# modelling all file

In [1]:
import os
import pandas as pd
import numpy as np
import time
import joblib as jb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold
pd.set_option('display.max_colwidth',2000)
pd.set_option('display.max_columns', 2000)

In [2]:
start = time.time()
f1_ = []
recall_ = []
std_dev_f1 = []
std_dev_recall = []
files_ = []
directory = "labeled_data\\"
for root,dirs,files in os.walk(directory):
    for file in files:
        if file.endswith(".csv"):
            print()
            print("-----------------------------------------")
            print()
            print("File no:", file)
            print()
            
            #reading the csv
            data = pd.read_csv(directory+file)
            #Applying log transormation on the numerical features
            data['followers'] = np.log1p(data["followers"])
            data["listed_count"] = np.log1p(data["listed_count"])
            data['Avg_favorites_per_post'] = np.log1p(data["Avg_favorites_per_post"])
            data['listed_follower_ratio'] = np.log1p(data["listed_follower_ratio"])
            data['follower-following_ratio'] =  np.log1p(data["follower-following_ratio"])
            
            #Encoding categorical features
            tweet_type = pd.get_dummies(data.tweet_type)
            data = pd.concat([data,tweet_type], axis = "columns")
            
            #Creating target and independent variable
            data = data.reindex(columns=['followers','listed_count','Avg_favorites_per_post','follower-following_ratio','listed_follower_ratio','Positive tweet','Negative tweet','target'])
            X = data.iloc[ : , :7]
            y = data.target
            
            cv = StratifiedKFold(n_splits= 10)
            
            w0 = len(X) / (2 * y.value_counts()[0])
            w1 =  len(X) / (2 * y.value_counts()[1])
            
            class_weight = {0:w0,1:w1}

            # Number of trees in random forest
            n_estimators = [int(x) for x in np.linspace(20,100, num = 5)]
            # Maximum number of levels in tree
            max_depth = [int(x) for x in np.linspace(20,100, num = 5)]
            max_depth.append(None)
            bootstrap = [True, False]
            criterion = ['entropy','gini']

            # Create the random grid
            random_grid = {'n_estimators': n_estimators,
                           'max_depth': max_depth,
                           'bootstrap': bootstrap,
                           'criterion' : criterion
                           }
            
            scores = ['f1_macro', 'recall_macro']
            model =  RandomForestClassifier(class_weight =class_weight)
            rf_random = GridSearchCV(estimator = model, param_grid  = random_grid,verbose=2, cv = cv,  n_jobs = -1,scoring = scores,refit='f1_macro')
            rf_random.fit(X, y)
            paramter = rf_random.best_params_
            
            f1_scores = []
            recall_scores = []

            for train_index,test_index in cv.split(X,y):
                X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index],y.iloc[test_index]

                w0 = len(X) / (2 * y_train.value_counts()[0])
                w1 =  len(X) / (2 * y_train.value_counts()[1])

                class_weight = {0:w0,1:w1}
                model = RandomForestClassifier(n_estimators = paramter['n_estimators'], max_depth = paramter['max_depth'], class_weight =class_weight,bootstrap = paramter['bootstrap'],criterion = paramter['criterion'])
                model.fit(X_train,y_train)
                y_pred = model.predict(X_test)
                f1_scores.append(f1_score(y_test,y_pred, average = 'macro',labels=np.unique(y_pred)))
                recall_scores.append(recall_score(y_pred,y_test,average = 'macro',labels=np.unique(y_pred)))
                
            f1_.append(round(np.mean(f1_scores),4))
            recall_.append(round(np.mean(recall_scores),4))
            std_dev_f1.append(round(np.std(f1_scores),4))
            std_dev_recall.append(round(np.std(recall_scores),4))
            
            print("F1 score:", round(np.mean(f1_scores),4))
            print("Std dev of F1 score:",round(np.std(f1_scores),4))
            print("Recall:", round(np.mean(recall_scores),4))
            print("Std dev of Recall:", round(np.std(recall_scores),4))
            
            file = file.split('.')
            file = file[0].split("_")
            file = file[1] + "_" + file[2]
            path = "saved_model\\" + file
            jb.dump(model, path)
            files_.append(file)
            

result = pd.DataFrame(
    {'dataset' : files_,
     'F1-score': f1_,
     'recall': recall_,
     'std_dev_f1':std_dev_f1,
     'std_dev_recall' : std_dev_recall
    })

folder = 'result\\'
result.to_csv(folder + 'result.csv', index = False)

end = time.time()
print()
print("=================================")
print("Total Time Taken:",end - start)
print("=================================")
print()
print("=============================")
print("Average F1-score:", round(np.mean(f1_),4))
print("Max F1-score:",round(np.max(f1_),4))
print("Min F1-score:",round(np.min(f1_),4))
print("Average Recall:", round(np.mean(recall_),4))
print("Max Recall:",round(np.max(recall_),4))
print("Min Recall:",round(np.min(recall_),4))
print("=============================")


-----------------------------------------

File no: cleaned_topic_1.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7409
Std dev of F1 score: 0.0509
Recall: 0.7861
Std dev of Recall: 0.0642

-----------------------------------------

File no: cleaned_topic_10.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.722
Std dev of F1 score: 0.0292
Recall: 0.7482
Std dev of Recall: 0.0227

-----------------------------------------

File no: cleaned_topic_100.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7536
Std dev of F1 score: 0.1118
Recall: 0.7874
Std dev of Recall: 0.1201

-----------------------------------------

File no: cleaned_topic_11.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7013
Std dev of F1 score: 0.0629
Recall: 0.7547
Std dev of Recall: 0.0711

-----------------------------------------

File no: cleaned_topic_12.csv

Fitting 10 folds for e

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7828
Std dev of F1 score: 0.0223
Recall: 0.8008
Std dev of Recall: 0.023

-----------------------------------------

File no: cleaned_topic_42.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7048
Std dev of F1 score: 0.08
Recall: 0.7395
Std dev of Recall: 0.0786

-----------------------------------------

File no: cleaned_topic_43.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7104
Std dev of F1 score: 0.051
Recall: 0.7229
Std dev of Recall: 0.0488

-----------------------------------------

File no: cleaned_topic_44.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7585
Std dev of F1 score: 0.0147
Recall: 0.7798
Std dev of Recall: 0.0153

-----------------------------------------

File no: cleaned_topic_45.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7568
Std dev of F1 scor

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7185
Std dev of F1 score: 0.0561
Recall: 0.7857
Std dev of Recall: 0.0791

-----------------------------------------

File no: cleaned_topic_76.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.8139
Std dev of F1 score: 0.0834
Recall: 0.8264
Std dev of Recall: 0.091

-----------------------------------------

File no: cleaned_topic_77.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.755
Std dev of F1 score: 0.0691
Recall: 0.813
Std dev of Recall: 0.074

-----------------------------------------

File no: cleaned_topic_78.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7647
Std dev of F1 score: 0.0582
Recall: 0.8148
Std dev of Recall: 0.0599

-----------------------------------------

File no: cleaned_topic_79.csv

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
F1 score: 0.7724
Std dev of F1 scor