In [148]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['patch.force_edgecolor'] = True
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score

In [149]:
# Load your CSV file

df = pd.read_csv('resources/kaggle_bot_accounts.csv')

# Display the first few rows to check the data
print(df.head())


   Unnamed: 0                     NAME  GENDER                    EMAIL_ID  \
0           0        Johnny KerrThomas    Male     jacksonalan@example.com   
1           1        Dwayne LarsenLara    Male        calvin80@example.com   
2           2                      NaN    Male          qbrown@example.net   
3           3  Russell SimmonsPhillips    Male  kimberlywagner@example.com   
4           4     Jamie WilsonMartinez  Female     shaunbrooks@example.com   

  IS_GLOGIN  FOLLOWER_COUNT  FOLLOWING_COUNT  DATASET_COUNT  CODE_COUNT  \
0     False            53.0             87.0            5.0         3.0   
1      True            16.0             67.0            5.0         NaN   
2      True            44.0             81.0            4.0        17.0   
3      True            23.0            114.0            5.0        24.0   
4     False            46.0            112.0            2.0        12.0   

   DISCUSSION_COUNT  AVG_NB_READ_TIME_MIN REGISTRATION_IPV4  \
0             124

In [150]:
# Check all column names
print(df.columns)



Index(['Unnamed: 0', 'NAME', 'GENDER', 'EMAIL_ID', 'IS_GLOGIN',
       'FOLLOWER_COUNT', 'FOLLOWING_COUNT', 'DATASET_COUNT', 'CODE_COUNT',
       'DISCUSSION_COUNT', 'AVG_NB_READ_TIME_MIN', 'REGISTRATION_IPV4',
       'REGISTRATION_LOCATION', 'TOTAL_VOTES_GAVE_NB', 'TOTAL_VOTES_GAVE_DS',
       'TOTAL_VOTES_GAVE_DC', 'ISBOT'],
      dtype='object')


In [151]:
# Get unique values for each column separately
name_unique = df['NAME'].unique()
gender_unique = df['GENDER'].unique()
email_unique = df['EMAIL_ID'].unique()
is_glogin_unique = df['IS_GLOGIN'].unique()
follower_count_unique = df['FOLLOWER_COUNT'].unique()
following_count_unique = df['FOLLOWING_COUNT'].unique()
dataset_count_unique = df['DATASET_COUNT'].unique()
code_count_unique = df['CODE_COUNT'].unique()
discussion_count_unique = df['DISCUSSION_COUNT'].unique()
avg_nb_read_time_min_unique = df['AVG_NB_READ_TIME_MIN'].unique()
registration_ipv4_unique = df['REGISTRATION_IPV4'].unique()
registration_location_unique = df['REGISTRATION_LOCATION'].unique()
total_votes_gave_nb_unique = df['TOTAL_VOTES_GAVE_NB'].unique()
total_votes_gave_ds_unique = df['TOTAL_VOTES_GAVE_DS'].unique()
total_votes_gave_dc_unique = df['TOTAL_VOTES_GAVE_DC'].unique()
isbot_unique = df['ISBOT'].unique()

# Print unique values for each column
print('Unique values in NAME:', name_unique)
print('Unique values in GENDER:', gender_unique)
print('Unique values in EMAIL_ID:', email_unique)
print('Unique values in IS_GLOGIN:', is_glogin_unique)
print('Unique values in FOLLOWER_COUNT:', follower_count_unique)
print('Unique values in FOLLOWING_COUNT:', following_count_unique)
print('Unique values in DATASET_COUNT:', dataset_count_unique)
print('Unique values in CODE_COUNT:', code_count_unique)
print('Unique values in DISCUSSION_COUNT:', discussion_count_unique)
print('Unique values in AVG_NB_READ_TIME_MIN:', avg_nb_read_time_min_unique)
print('Unique values in REGISTRATION_IPV4:', registration_ipv4_unique)
print('Unique values in REGISTRATION_LOCATION:', registration_location_unique)
print('Unique values in TOTAL_VOTES_GAVE_NB:', total_votes_gave_nb_unique)
print('Unique values in TOTAL_VOTES_GAVE_DS:', total_votes_gave_ds_unique)
print('Unique values in TOTAL_VOTES_GAVE_DC:', total_votes_gave_dc_unique)
print('Unique values in ISBOT:', isbot_unique)


Unique values in NAME: ['Johnny KerrThomas' 'Dwayne LarsenLara' nan ... 'Hector TerryLogan'
 'William CollinsMartinez' 'Susan WilliamsJimenez']
Unique values in GENDER: ['Male' 'Female' nan]
Unique values in EMAIL_ID: ['jacksonalan@example.com' 'calvin80@example.com' 'qbrown@example.net' ...
 'penningtondebra@example.net' 'kerrycastro@example.com'
 'jonesmisty@example.net']
Unique values in IS_GLOGIN: [False True nan]
Unique values in FOLLOWER_COUNT: [53. 16. 44. 23. 46.  2. 50. 65. nan 70. 49.  0. 45. 55. 34. 19. 21.  3.
 38. 57. 29. 10. 58. 61.  9. 64.  1. 48. 12. 32. 60. 27. 20. 18. 54. 37.
 63. 17. 26. 39. 56. 41. 51. 31. 62.  4. 25. 69. 59.  8. 15. 66. 36. 22.
 28.  6. 30. 24. 14. 67. 52. 11. 40. 13. 42. 43. 33. 68.  7. 35.  5. 47.]
Unique values in FOLLOWING_COUNT: [ 87.  67.  81. 114. 112.   2.  36.   1.  25.  99.  44.  18.  14.  nan
  15.  30.  62.  55.   3. 115.  84.   0. 120.  97.  17.  37.  68.  54.
  75.  34. 107.  90.  88.  91.  86.  38.  53.  63.  51.  35.  29.  23.
   4.

In [152]:
# Step 0: Before cleaning - Print basic info and summary
print("Before Cleaning:")
print(df.info())  # Info about columns, data types, and non-null counts
print("\nSummary statistics (before):")
print(df.describe(include='all'))  # Summary statistics before cleaning

# Step 1: Remove unnecessary columns
df = df.drop(columns=['Unnamed: 0'])

# Step 2: Handle missing values
# For categorical columns
df['NAME'] = df['NAME'].fillna('Unknown')
df['GENDER'] = df['GENDER'].fillna('Unknown')
df['EMAIL_ID'] = df['EMAIL_ID'].fillna('unknown@example.com')
df['IS_GLOGIN'] = df['IS_GLOGIN'].fillna(False)

# For numerical columns (e.g., filling NaNs with mean)
df['FOLLOWER_COUNT'] = df['FOLLOWER_COUNT'].fillna(df['FOLLOWER_COUNT'].mean())
df['FOLLOWING_COUNT'] = df['FOLLOWING_COUNT'].fillna(df['FOLLOWING_COUNT'].mean())
df['DATASET_COUNT'] = df['DATASET_COUNT'].fillna(0)  # Assuming 0 is a reasonable fill
df['CODE_COUNT'] = df['CODE_COUNT'].fillna(0)
df['DISCUSSION_COUNT'] = df['DISCUSSION_COUNT'].fillna(0)
df['AVG_NB_READ_TIME_MIN'] = df['AVG_NB_READ_TIME_MIN'].fillna(df['AVG_NB_READ_TIME_MIN'].mean())
df['TOTAL_VOTES_GAVE_NB'] = df['TOTAL_VOTES_GAVE_NB'].fillna(0)
df['TOTAL_VOTES_GAVE_DS'] = df['TOTAL_VOTES_GAVE_DS'].fillna(0)
df['TOTAL_VOTES_GAVE_DC'] = df['TOTAL_VOTES_GAVE_DC'].fillna(0)

# Step 3: Standardize categorical values (if needed)
df['GENDER'] = df['GENDER'].str.title()  # Capitalize gender values

# Step 4: Convert data types
df['FOLLOWER_COUNT'] = df['FOLLOWER_COUNT'].astype(int)
df['FOLLOWING_COUNT'] = df['FOLLOWING_COUNT'].astype(int)

# Step 5: Remove duplicates
df = df.drop_duplicates()

# Step 6: After cleaning - Print basic info and summary
print("\nAfter Cleaning:")
print(df.info())  # Info about columns, data types, and non-null counts
print("\nSummary statistics (after):")
print(df.describe(include='all'))  # Summary statistics after cleaning


Before Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1321188 entries, 0 to 1321187
Data columns (total 17 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1321188 non-null  int64  
 1   NAME                   1243024 non-null  object 
 2   GENDER                 1243309 non-null  object 
 3   EMAIL_ID               1243374 non-null  object 
 4   IS_GLOGIN              1243272 non-null  object 
 5   FOLLOWER_COUNT         1243476 non-null  float64
 6   FOLLOWING_COUNT        1242743 non-null  float64
 7   DATASET_COUNT          1242621 non-null  float64
 8   CODE_COUNT             1243262 non-null  float64
 9   DISCUSSION_COUNT       1243466 non-null  float64
 10  AVG_NB_READ_TIME_MIN   1242872 non-null  float64
 11  REGISTRATION_IPV4      1242859 non-null  object 
 12  REGISTRATION_LOCATION  1242898 non-null  object 
 13  TOTAL_VOTES_GAVE_NB    1243483 non-null  float64
 14  T

In [153]:
# Handle missing values for REGISTRATION_IPV4 and REGISTRATION_LOCATION
df['REGISTRATION_IPV4'] = df['REGISTRATION_IPV4'].fillna('Unknown')
df['REGISTRATION_LOCATION'] = df['REGISTRATION_LOCATION'].fillna('Unknown')

# Handle missing values for ISBOT
df['ISBOT'] = df['ISBOT'].fillna('Unknown')  # Or you can set to False if that's more appropriate

# Optional: Convert float columns to int if applicable
float_columns = ['DATASET_COUNT', 'CODE_COUNT', 'DISCUSSION_COUNT', 'AVG_NB_READ_TIME_MIN',
                 'TOTAL_VOTES_GAVE_NB', 'TOTAL_VOTES_GAVE_DS', 'TOTAL_VOTES_GAVE_DC']

for col in float_columns:
    df[col] = df[col].astype(int)

# Check the updated DataFrame
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1321188 entries, 0 to 1321187
Data columns (total 16 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   NAME                   1321188 non-null  object
 1   GENDER                 1321188 non-null  object
 2   EMAIL_ID               1321188 non-null  object
 3   IS_GLOGIN              1321188 non-null  bool  
 4   FOLLOWER_COUNT         1321188 non-null  int32 
 5   FOLLOWING_COUNT        1321188 non-null  int32 
 6   DATASET_COUNT          1321188 non-null  int32 
 7   CODE_COUNT             1321188 non-null  int32 
 8   DISCUSSION_COUNT       1321188 non-null  int32 
 9   AVG_NB_READ_TIME_MIN   1321188 non-null  int32 
 10  REGISTRATION_IPV4      1321188 non-null  object
 11  REGISTRATION_LOCATION  1321188 non-null  object
 12  TOTAL_VOTES_GAVE_NB    1321188 non-null  int32 
 13  TOTAL_VOTES_GAVE_DS    1321188 non-null  int32 
 14  TOTAL_VOTES_GAVE_DC    1321188 non

In [154]:
# Drop rows with NaN values in the ISBOT column
data_cleaned = df.dropna(subset=['ISBOT'])

# Now you can separate bots and non-bots
bots = data_cleaned[data_cleaned['ISBOT'] == True]
nonbots = data_cleaned[data_cleaned['ISBOT'] == False]
# Optional: print the counts
print("Number of bots:", bots.shape[0])
print("Number of non-bots:", nonbots.shape[0])

Number of bots: 332894
Number of non-bots: 909794


In [155]:
# Display the unique values in the ISBOT column
print(data_cleaned['ISBOT'].unique())

['Unknown' False True]


In [156]:
# Get the count of each value in the ISBOT column
print(data_cleaned['ISBOT'].value_counts())


ISBOT
False      909794
True       332894
Unknown     78500
Name: count, dtype: int64


In [157]:
# Remove rows where ISBOT is 'Unknown'
data_cleaned = data_cleaned[data_cleaned['ISBOT'] != 'Unknown']


In [158]:
# Convert ISBOT to binary (1 for True, 0 for False)
data_cleaned['ISBOT_BINARY'] = data_cleaned['ISBOT'].astype(int)


In [159]:
# Get the count of each value in the ISBOT column
print(data_cleaned['ISBOT'].value_counts())


ISBOT
False    909794
True     332894
Name: count, dtype: int64


In [160]:
# Map True to 0 and False to 1 in the ISBOT column
data_cleaned['ISBOT_BINARY'] = data_cleaned['ISBOT'].map({True: 0, False: 1})

# Verify the changes
print(data_cleaned[['ISBOT', 'ISBOT_BINARY']].head())

   ISBOT  ISBOT_BINARY
2  False             1
3  False             1
4  False             1
5   True             0
6  False             1


In [143]:
# Assuming 'data_cleaned' is your final DataFrame after cleaning
data_cleaned.to_csv('cleaned_data.csv', index=False)


NameError: name 'data_cleaned' is not defined