In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
df = pd.read_csv('bot_detection_data.csv')
df.head()

Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,11-05-2020 15:29,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,26-11-2022 05:18,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,08-08-2022 03:16,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,14-08-2021 22:27,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,13-04-2020 21:24,foreign mention


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   User ID         50000 non-null  int64 
 1   Username        50000 non-null  object
 2   Tweet           50000 non-null  object
 3   Retweet Count   50000 non-null  int64 
 4   Mention Count   50000 non-null  int64 
 5   Follower Count  50000 non-null  int64 
 6   Verified        50000 non-null  bool  
 7   Bot Label       50000 non-null  int64 
 8   Location        50000 non-null  object
 9   Created At      50000 non-null  object
 10  Hashtags        41659 non-null  object
dtypes: bool(1), int64(5), object(5)
memory usage: 3.9+ MB


In [4]:
df.isna().sum()

User ID              0
Username             0
Tweet                0
Retweet Count        0
Mention Count        0
Follower Count       0
Verified             0
Bot Label            0
Location             0
Created At           0
Hashtags          8341
dtype: int64

In [5]:
#adding our own data in missing hastag rows
missing_values_mask=df['Hashtags'].isnull()

df.loc[missing_values_mask,'Hashtags']='No Hashtag'#add No Hashtag at missing cell
df['Hashtags']

0                             No Hashtag
1                              both live
2                            phone ahead
3                     ever quickly new I
4                        foreign mention
                      ...               
49995    teach quality ten education any
49996             add walk among believe
49997            onto admit artist first
49998                               star
49999                               home
Name: Hashtags, Length: 50000, dtype: object

In [6]:
df['Created At']=pd.to_datetime(df['Created At'], format='%d-%m-%Y %H:%M')
print(df['Created At'].dtypes)

datetime64[ns]


In [7]:
# Assuming you have a DataFrame named 'data' with multiple columns

# List of columns to drop
columns_to_drop = ['User ID', 'Retweet Count', 'Mention Count','Follower Count','Verified','Bot Label','Hashtags']

# Drop the specified columns from the DataFrame
data_dropped = df.drop(columns=columns_to_drop)

# Get value counts for each remaining column
column_counts = {}
for column in data_dropped.columns:
    column_counts[column] = data_dropped[column].value_counts()

# Print the value counts for each column
for column, counts in column_counts.items():
    print(f"Value counts for {column}:")
    print(counts)
    print()

Value counts for Username:
Username
ksmith             21
usmith             16
msmith             16
vmiller            15
ismith             13
                   ..
jessica57           1
ggraham             1
john93              1
gallowaymichael     1
daniel29            1
Name: count, Length: 40566, dtype: int64

Value counts for Tweet:
Tweet
Station activity person against natural majority none few size expect six marriage.        1
Institution second billion over song either arm.                                           1
However plan meeting certain dinner card produce wear whether give hour something.         1
Total least today until clear nearly economy book single with successful.                  1
Full likely beautiful example partner process top catch control natural lead push help.    1
                                                                                          ..
News society threat positive someone accept stand pressure life so describe pretty.        1

In [8]:
#identify inconsistent formatting for textual(categorical too) columns

print(df['Hashtags'].value_counts())

Hashtags
No Hashtag                            8341
area                                    21
big                                     20
treat                                   19
ground                                  18
                                      ... 
president conference field process       1
market live mouth sit wide               1
your five                                1
serious not Democrat                     1
onto admit artist first                  1
Name: count, Length: 34248, dtype: int64


In [9]:
# Filter data based on bot label
bot_data = df[df['Bot Label'] == 1]
non_bot_data = df[df['Bot Label'] == 0]

In [10]:
bot_followers_stats = bot_data['Follower Count'].describe()
bot_followers_stats

count    25018.000000
mean      4991.944280
std       2876.289818
min          0.000000
25%       2497.000000
50%       4978.000000
75%       7468.000000
max      10000.000000
Name: Follower Count, dtype: float64

In [11]:
non_bot_followers_stats = non_bot_data['Follower Count'].describe()
non_bot_followers_stats

count    24982.000000
mean      4985.255664
std       2881.251104
min          0.000000
25%       2480.250000
50%       5007.500000
75%       7472.000000
max      10000.000000
Name: Follower Count, dtype: float64

In [12]:
bot_retweet_stats = bot_data['Retweet Count'].describe()
bot_retweet_stats

count    25018.000000
mean        50.042050
std         29.171048
min          0.000000
25%         25.000000
50%         50.000000
75%         75.000000
max        100.000000
Name: Retweet Count, dtype: float64

In [13]:
non_bot_retweet_stats = non_bot_data['Retweet Count'].describe()
non_bot_retweet_stats

count    24982.000000
mean        49.969098
std         29.191822
min          0.000000
25%         25.000000
50%         50.000000
75%         75.000000
max        100.000000
Name: Retweet Count, dtype: float64

In [14]:
bot_mention_stats = bot_data['Mention Count'].describe()
bot_mention_stats

count    25018.000000
mean         2.501959
std          1.704641
min          0.000000
25%          1.000000
50%          3.000000
75%          4.000000
max          5.000000
Name: Mention Count, dtype: float64

In [15]:
non_bot_mention_stats = non_bot_data['Mention Count'].describe()
non_bot_mention_stats

count    24982.000000
mean         2.525578
std          1.712435
min          0.000000
25%          1.000000
50%          3.000000
75%          4.000000
max          5.000000
Name: Mention Count, dtype: float64

In [16]:
#label encoding of boolean data
label_encoder=LabelEncoder()
df['Verified']=label_encoder.fit_transform(df['Verified'])
df

Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,0,1,Adkinston,2020-05-11 15:29:00,No Hashtag
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,1,0,Sanderston,2022-11-26 05:18:00,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,1,0,Harrisonfurt,2022-08-08 03:16:00,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,1,1,Martinezberg,2021-08-14 22:27:00,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,0,1,Camachoville,2020-04-13 21:24:00,foreign mention
...,...,...,...,...,...,...,...,...,...,...,...
49995,491196,uberg,Want but put card direction know miss former h...,64,0,9911,1,1,Lake Kimberlyburgh,2023-04-20 11:06:00,teach quality ten education any
49996,739297,jessicamunoz,Provide whole maybe agree church respond most ...,18,5,9900,0,1,Greenbury,2022-10-18 03:57:00,add walk among believe
49997,674475,lynncunningham,Bring different everyone international capital...,43,3,6313,1,1,Deborahfort,2020-07-08 03:54:00,onto admit artist first
49998,167081,richardthompson,Than about single generation itself seek sell ...,45,1,6343,0,0,Stephenside,2022-03-22 12:13:00,star


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   User ID         50000 non-null  int64         
 1   Username        50000 non-null  object        
 2   Tweet           50000 non-null  object        
 3   Retweet Count   50000 non-null  int64         
 4   Mention Count   50000 non-null  int64         
 5   Follower Count  50000 non-null  int64         
 6   Verified        50000 non-null  int64         
 7   Bot Label       50000 non-null  int64         
 8   Location        50000 non-null  object        
 9   Created At      50000 non-null  datetime64[ns]
 10  Hashtags        50000 non-null  object        
dtypes: datetime64[ns](1), int64(6), object(4)
memory usage: 4.2+ MB


In [18]:
# Combine all text columns into a single Series
text_data = df['Tweet'] + ' ' + df['Username']  #+ ' ' + df['Hashtags']+' '+df['Location']

# Text vectorization using TF-IDF (sparse representation)
vectorizer = TfidfVectorizer()
text_sparse = vectorizer.fit_transform(text_data)
text_sparse = text_sparse.astype('float64')

In [19]:
print(text_sparse)

  (0, 11858)	0.46481328154873097
  (0, 24073)	0.25355889816889193
  (0, 34453)	0.2548062005793583
  (0, 11495)	0.2591631402114371
  (0, 34454)	0.25986428901276315
  (0, 11669)	0.25721738720798487
  (0, 27882)	0.2530346272292299
  (0, 23571)	0.2559032100500245
  (0, 27273)	0.2545360295186273
  (0, 367)	0.2553514009893523
  (0, 29399)	0.2548062005793583
  (0, 171)	0.2548062005793583
  (0, 35014)	0.25400045051628134
  (1, 14818)	0.5319139828680403
  (1, 2513)	0.27194443790651635
  (1, 29984)	0.2653535175406968
  (1, 7338)	0.26824604325879187
  (1, 30679)	0.2655349424182583
  (1, 34968)	0.2698103676280364
  (1, 24473)	0.2656259137071012
  (1, 22582)	0.2673880968407885
  (1, 31084)	0.2697110832769586
  (1, 2582)	0.26682456892609696
  (1, 27273)	0.26729371300388916
  (2, 31920)	0.5303089121758462
  :	:
  (49997, 5286)	0.3028287451234789
  (49997, 4443)	0.3029374431359621
  (49998, 31424)	0.530438550394934
  (49998, 15094)	0.2680810065220719
  (49998, 33757)	0.26946006780870024
  (49998, 3444

In [20]:
# Combine text features with additional features
additional_features = df[[ 'Verified','Follower Count','Created At']]

In [21]:
additional_features['Created At'] = additional_features['Created At'].astype(np.int64) # Convert to Unix timestamp
additional_features = additional_features.astype('float64')  # Convert to float64

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  additional_features['Created At'] = additional_features['Created At'].astype(np.int64) # Convert to Unix timestamp


In [22]:
combined_sparse = hstack((text_sparse, additional_features))

In [23]:
rfc = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(combined_sparse, df['Bot Label'], test_size=0.2, random_state=2)

In [24]:
forest_params = [{'max_depth': list(range(10, 15)),
                'max_features': list(range(0,14)),
                "min_samples_leaf":range(1,15,1),
                "min_samples_split":range(2,10,1) }]

clf = GridSearchCV(rfc, forest_params, cv = 3, scoring='accuracy')

clf.fit(X_train, y_train)

In [25]:
# clf.best_params_

In [26]:
Bernoulli_cf = BernoulliNB()

In [27]:
Bernoulli_cf.fit(X_train,y_train)

In [28]:
Bernoulli_train_pred = Bernoulli_cf.predict(X_train)
print("Traning accuracy of Bernoulli naive bayes : ",accuracy_score(y_train,Bernoulli_train_pred))

Traning accuracy of Bernoulli naive bayes :  0.925275


In [29]:
Bernoulli_pred = Bernoulli_cf.predict(X_test)
print("Testing accuracy of Bernoulli naive bayes : ",accuracy_score(y_test,Bernoulli_pred))

Testing accuracy of Bernoulli naive bayes :  0.4903


In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_sparse, df['Bot Label'], test_size=0.2, random_state=122)

# Create an instance of the Random Forest classifier
rf_classifier = RandomForestClassifier(max_depth=14, max_features=3)

# Train the Random Forest classifier
rf_classifier.fit(X_train, y_train)

In [31]:
rf_train_predict = rf_classifier.predict(X_train)
accuracy_score(y_train, rf_train_predict)

0.536375

In [32]:
# Make predictions on the testing data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.5049
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.12      0.20      4979
           1       0.50      0.88      0.64      5021

    accuracy                           0.50     10000
   macro avg       0.51      0.50      0.42     10000
weighted avg       0.51      0.50      0.42     10000



In [33]:
new_data = pd.DataFrame({

    'Tweet': ['Just cover eight opportunity strong policy which.'],

    'Username': ['pmason'],

    # 'Hashtags': ['neever quickly new Iw'],

    # 'Retweet Count': [54],

    'Verified': [1],

    # 'Location' : ['Martinezberg'],

    # 'Mention Count' : [5],

    'Follower Count' : [2242],

    'Created At' : ['14-08-2021  22:27:00']
})

# Concatenate text features

new_text_data = new_data['Tweet'] + ' ' + new_data['Username'] #+ ' ' + new_data['Hashtags']+' '+new_data['Location']

# Perform feature vectorization on new text features

new_text_sparse = vectorizer.transform(new_text_data)


# Combine new text features with additional features

# new_additional_features = new_data[['Retweet Count', 'Verified','Mention Count','Follower Count','Created At']]
new_additional_features = new_data[['Verified','Follower Count','Created At']]


new_additional_features['Created At'] = pd.to_datetime(new_additional_features['Created At'])
new_additional_features['Created At'] = new_additional_features['Created At'].apply(lambda x: int(datetime.timestamp(x)))


# new_additional_features = new_additional_features.astype('float64')
new_additional_features['Created At'] = new_additional_features['Created At'].astype(int)  # Convert to Unix timestamp
new_additional_features = new_additional_features.astype('float64')


new_combined_sparse = hstack((new_text_sparse, new_additional_features))

# Predict labels for the new data
new_predictions = rf_classifier.predict(new_combined_sparse)

  new_additional_features['Created At'] = pd.to_datetime(new_additional_features['Created At'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_additional_features['Created At'] = pd.to_datetime(new_additional_features['Created At'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_additional_features['Created At'] = new_additional_features['Created At'].apply(lambda x: int(datetime.timestamp(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentati

In [34]:
# Print the predictions

for i, prediction in enumerate(new_predictions):

    if prediction == 1:

        print(f"Data point {i+1}: Bot")

    else:

        print(f"Data point {i+1}: Not Bot")

Data point 1: Bot


In [39]:
pickle.dump(rf_classifier,open('model.pkl','wb'))