In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['patch.force_edgecolor'] = True
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score

In [39]:
# Read the bot-detection dataset into a DataFrame
dataset_path = 'resources/bot_detection_data.csv'
df = pd.read_csv(dataset_path)

# Preview the first few rows as a sanity check on the load
preview_rows = df.head()
print(preview_rows)


   User ID        Username                                              Tweet  \
0   132131           flong  Station activity person against natural majori...   
1   289683  hinesstephanie  Authority research natural life material staff...   
2   779715      roberttran  Manage whose quickly especially foot none to g...   
3   696168          pmason  Just cover eight opportunity strong policy which.   
4   704441          noah87                      Animal sign six data good or.   

   Retweet Count  Mention Count  Follower Count  Verified  Bot Label  \
0             85              1            2353     False          1   
1             55              5            9617      True          0   
2              6              2            4363      True          0   
3             54              5            2242      True          1   
4             26              3            8438     False          1   

       Location           Created At            Hashtags  
0     Adkinston  2020

In [40]:
# Check all column names: print the DataFrame's column index
print(df.columns)



Index(['User ID', 'Username', 'Tweet', 'Retweet Count', 'Mention Count',
       'Follower Count', 'Verified', 'Bot Label', 'Location', 'Created At',
       'Hashtags'],
      dtype='object')


In [41]:
# Columns to inspect, listed in the order their values are reported
inspected_columns = [
    'User ID', 'Username', 'Tweet', 'Retweet Count', 'Mention Count',
    'Follower Count', 'Verified', 'Bot Label', 'Location', 'Created At',
    'Hashtags',
]

# Distinct values of every inspected column, keyed by column name
unique_by_column = {column: df[column].unique() for column in inspected_columns}

# Expose each result under its own name as well
user_id_unique = unique_by_column['User ID']
username_unique = unique_by_column['Username']
tweet_unique = unique_by_column['Tweet']
retweet_count_unique = unique_by_column['Retweet Count']
mention_count_unique = unique_by_column['Mention Count']
follower_count_unique = unique_by_column['Follower Count']
verified_unique = unique_by_column['Verified']
bot_label_unique = unique_by_column['Bot Label']
location_unique = unique_by_column['Location']
created_at_unique = unique_by_column['Created At']
hashtags_unique = unique_by_column['Hashtags']

# Print one line per column, in the same order as above
for column, values in unique_by_column.items():
    print(f'Unique values in {column}:', values)


Unique values in User ID: [132131 289683 779715 ... 674475 167081 311204]
Unique values in Username: ['flong' 'hinesstephanie' 'roberttran' ... 'lynncunningham'
 'richardthompson' 'daniel29']
Unique values in Tweet: ['Station activity person against natural majority none few size expect six marriage.'
 'Authority research natural life material staff rate common protect attention.'
 'Manage whose quickly especially foot none to goal range case.' ...
 'Bring different everyone international capital government yeah.'
 'Than about single generation itself seek sell still hour her.'
 'Here morning class various room human true because lot send attention.']
Unique values in Retweet Count: [ 85  55   6  54  26  41  64  25  67  57  29  60  61  21  78  43  39   8
  84  86  56  49   7  75  77  40  15  13  63  58  34  66  18   0  72  24
  87  44  46  10  23  28   3  71  89  59  97  70  88  69  81  99  37   4
  93  30  74  62  90  20  98  76  92   5  17  52  83  35  45  91   9  65
  19  51  73  11

In [42]:
# Print the column labels of df again
print(df.columns)


Index(['User ID', 'Username', 'Tweet', 'Retweet Count', 'Mention Count',
       'Follower Count', 'Verified', 'Bot Label', 'Location', 'Created At',
       'Hashtags'],
      dtype='object')


In [43]:
# Step 0: Before cleaning - show structure and summary statistics
print("Before Cleaning:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print("\nSummary statistics (before):")
print(df.describe(include='all'))  # Summary statistics before cleaning

# Step 1: Handle missing values (one fill value per column, applied in order)
fill_values = {
    'Username': 'Unknown',
    'Location': 'Unknown',
    'Verified': False,  # Assuming False is appropriate for missing values
    # NOTE(review): filling a missing label with 0 marks unlabeled rows as
    # non-bots and makes the later dropna on 'Bot Label' a no-op - confirm
    # this is intended rather than dropping such rows.
    'Bot Label': 0,
    'Hashtags': '',  # Missing hashtags become an empty string
    'Follower Count': df['Follower Count'].mean(),
    'Retweet Count': 0,  # Assuming 0 is a reasonable fill
    'Mention Count': 0,  # Assuming 0 is a reasonable fill
}
for column, fill_value in fill_values.items():
    df[column] = df[column].fillna(fill_value)

# Step 2: Standardize categorical values (title-case the text columns)
for column in ('Username', 'Location'):
    df[column] = df[column].str.title()

# Step 3: Convert data types
# errors='coerce' turns unparseable timestamps into NaT instead of raising
df['Created At'] = pd.to_datetime(df['Created At'], errors='coerce')
for column in ('Follower Count', 'Retweet Count', 'Mention Count'):
    df[column] = df[column].astype(int)

# Step 4: Remove duplicates (rows identical across every column)
df = df.drop_duplicates()

# Step 5: After cleaning - show structure and summary statistics again
print("\nAfter Cleaning:")
df.info()  # Called directly for the same reason as in Step 0
print("\nSummary statistics (after):")
print(df.describe(include='all'))  # Summary statistics after cleaning


Before Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   User ID         50000 non-null  int64 
 1   Username        50000 non-null  object
 2   Tweet           50000 non-null  object
 3   Retweet Count   50000 non-null  int64 
 4   Mention Count   50000 non-null  int64 
 5   Follower Count  50000 non-null  int64 
 6   Verified        50000 non-null  bool  
 7   Bot Label       50000 non-null  int64 
 8   Location        50000 non-null  object
 9   Created At      50000 non-null  object
 10  Hashtags        41659 non-null  object
dtypes: bool(1), int64(5), object(5)
memory usage: 3.9+ MB
None

Summary statistics (before):
              User ID Username  \
count    50000.000000    50000   
unique            NaN    40566   
top               NaN   ksmith   
freq              NaN       21   
mean    548890.680540      NaN   
std 

In [44]:
# Step 1: Keep only the rows whose 'Bot Label' is present
data_cleaned = df.dropna(subset=['Bot Label'])

# Step 2: Partition the rows by label value
# (1 is assumed to mark a bot, 0 a non-bot)
label_column = data_cleaned['Bot Label']
bots = data_cleaned[label_column.eq(1)]
nonbots = data_cleaned[label_column.eq(0)]

# Optional: report how many rows landed in each group
print("Number of bots:", len(bots))
print("Number of non-bots:", len(nonbots))


Number of bots: 25018
Number of non-bots: 24982


In [45]:
# Display the distinct values present in the 'Bot Label' column
print(data_cleaned['Bot Label'].unique())

[1 0]


In [46]:
# Count how many rows carry each distinct 'Bot Label' value
bot_label_counts = data_cleaned['Bot Label'].value_counts()
print(bot_label_counts)

Bot Label
1    25018
0    24982
Name: count, dtype: int64


In [47]:
# Print the column labels of df once more
print(df.columns)


Index(['User ID', 'Username', 'Tweet', 'Retweet Count', 'Mention Count',
       'Follower Count', 'Verified', 'Bot Label', 'Location', 'Created At',
       'Hashtags'],
      dtype='object')


In [48]:
# Reload the raw CSV. This rebinds df, so the in-memory cleaning applied to the
# previous df is not carried over; data_cleaned is a separate object and is
# left untouched.
df = pd.read_csv('resources/bot_detection_data.csv')

# Check available columns
print("Available columns:", df.columns)

# Drop the 'Bot Label' column from this working frame.
# NOTE(review): 'Bot Label' is the column the earlier cells treat as the
# bot/non-bot label - confirm it is indeed the column you want to drop.
df = df.drop(columns=['Bot Label'])

# Handle missing values by assigning the filled column back to df.
# Calling fillna(..., inplace=True) on df[col] is chained assignment: it is
# deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Location'] = df['Location'].fillna('Unknown')  # Fill NaN in Location
# NOTE(review): the earlier cleaning cell fills missing Hashtags with '' while
# this one uses the string 'None' - confirm which placeholder is wanted.
df['Hashtags'] = df['Hashtags'].fillna('None')     # Fill NaN in Hashtags

# For numerical columns, fill NaNs with the column mean
for column in ('Follower Count', 'Retweet Count', 'Mention Count'):
    df[column] = df[column].fillna(df[column].mean())

# Check the DataFrame after cleaning
print("DataFrame after cleaning:")
print(df.head())


Available columns: Index(['User ID', 'Username', 'Tweet', 'Retweet Count', 'Mention Count',
       'Follower Count', 'Verified', 'Bot Label', 'Location', 'Created At',
       'Hashtags'],
      dtype='object')
DataFrame after cleaning:
   User ID        Username                                              Tweet  \
0   132131           flong  Station activity person against natural majori...   
1   289683  hinesstephanie  Authority research natural life material staff...   
2   779715      roberttran  Manage whose quickly especially foot none to g...   
3   696168          pmason  Just cover eight opportunity strong policy which.   
4   704441          noah87                      Animal sign six data good or.   

   Retweet Count  Mention Count  Follower Count  Verified      Location  \
0             85              1            2353     False     Adkinston   
1             55              5            9617      True    Sanderston   
2              6              2            4363     

In [49]:
# Persist data_cleaned to disk as CSV, leaving the row index out of the file
output_csv = 'cleaned_data.csv'
data_cleaned.to_csv(output_csv, index=False)
