## Loading the Twitter Data Set and Dropping Non-Numeric Features

In [3]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pickle


In [4]:
# Load the raw Twitter human/bot data set.
data = pd.read_csv("twitter_human_bots_dataset.csv")

# Columns removed before modelling (described in the original notebook as the
# non-numeric features).
columns_to_drop = [
    'id',
    'Unnamed: 0',
    'created_at',
    'description',
    'profile_image_url',
    'profile_background_image_url',
    'screen_name',
    'location',
    'lang',
]
X_without_string = data.drop(columns=columns_to_drop)

# Feature matrix (X): everything left except the label column.
X = X_without_string.drop(columns='account_type')
# Target vector (y): the account type label.
y = data['account_type']

# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [55]:
# Learn the scaling statistics from the training split only, so nothing from
# the test split influences the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

In [5]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,default_profile,default_profile_image,favourites_count,followers_count,friends_count,geo_enabled,statuses_count,verified,average_tweets_per_day,account_age_days
2084,True,False,16227,7058,712,True,35077,True,7.607,4611
21286,True,False,3269,807159,579,True,2284,True,0.917,2492
29621,True,False,2231,19,125,True,3092,False,1.629,1898
26836,False,False,788,43912,141,True,16966,False,4.164,4074
32198,False,False,29210,867154,738,False,50870,True,14.622,3479


In [6]:
# List the feature columns that remain after the drops above.
X_train.columns

Index(['default_profile', 'default_profile_image', 'favourites_count',
       'followers_count', 'friends_count', 'geo_enabled', 'statuses_count',
       'verified', 'average_tweets_per_day', 'account_age_days'],
      dtype='object')

In [56]:
# Apply the scaler fitted on the training split to both splits.
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [57]:
# Random forest with fixed hyper-parameters.
# random_state is pinned (same seed as the train/test split) so that the
# bootstrap samples and per-split feature choices are repeatable: without it,
# every Restart & Run All trains a different forest and reports different
# metrics.
Best_RF = RandomForestClassifier(
    criterion='gini',
    max_depth=100,
    min_samples_split=10,
    n_estimators=100,
    random_state=42,
)

In [59]:
# Train the forest on the scaled training features and their labels.
Best_RF.fit(X=X_train_scaled, y=y_train)

In [60]:
# Predict labels for the held-out split, then print per-class
# precision / recall / F1 against the true labels.
y_pred = Best_RF.predict(X=X_test_scaled)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         bot       0.86      0.76      0.81      3069
       human       0.89      0.94      0.91      6291

    accuracy                           0.88      9360
   macro avg       0.87      0.85      0.86      9360
weighted avg       0.88      0.88      0.88      9360



In [61]:
# Persist the trained classifier to disk as a pickle.
with open('best_rf_model.pkl', 'wb') as model_file:
    pickle.dump(Best_RF, model_file)

In [62]:
# Persist the fitted scaler as a pickle, so the same scaling can be re-applied
# wherever the saved model is used.
# NOTE(review): this file name has no extension, unlike 'best_rf_model.pkl';
# left as-is because other code may load it by this exact name — confirm.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)