In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

## Inspection

In [4]:
# Read files 
tweets = pd.read_csv("../Data/cresci-2017.csv/datasets_full.csv/traditional_spambots_1.csv/traditional_spambots_1.csv/tweets.csv", encoding='utf-8')
users = pd.read_csv("../Data/cresci-2017.csv/datasets_full.csv/traditional_spambots_1.csv/traditional_spambots_1.csv/users.csv", encoding='utf-8')


In [5]:
# If I want to loop some values
files_dict = {
    'tweets': tweets,
    'users': users
}

In [6]:
# Initial look of each one 
for name, df in files_dict.items():
    print(f"Dataframe name: {name}")
    print(df.head(),"\n\n")

Dataframe name: tweets
            id                                               text  \
0  22642586115     CPPRI Recruitment 2010 at http://ping.fm/yp8zH   
1  22642583483  National Games Secretariat Recruitment 2010  :...   
2  22642524678     CIPET Recruitment Jobs at http://ping.fm/KnFCa   
3  22642504361      DIAT Recruitment 2010 at http://ping.fm/huS9m   
4  22642475789       BHEL Recruitment 2010 : http://ping.fm/PLWWA   

                                              source  user_id  truncated  \
0  <a href="http://www.ping.fm/" rel="nofollow">P...  7248952        NaN   
1  <a href="http://www.ping.fm/" rel="nofollow">P...  7248952        NaN   
2  <a href="http://www.ping.fm/" rel="nofollow">P...  7248952        NaN   
3  <a href="http://www.ping.fm/" rel="nofollow">P...  7248952        NaN   
4  <a href="http://www.ping.fm/" rel="nofollow">P...  7248952        NaN   

   in_reply_to_status_id  in_reply_to_user_id in_reply_to_screen_name  \
0                      0        

In [7]:
# Shapes
for name, df in files_dict.items():
    print(f"Dataframe name: {name}")
    print(df.shape,"\n\n")

Dataframe name: tweets
(145094, 25) 


Dataframe name: users
(1000, 40) 




In [8]:
# Info
for name, df in files_dict.items():
    print(f"Dataframe name: {name}")
    print(df.info(),"\n\n")

Dataframe name: tweets
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145094 entries, 0 to 145093
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       145094 non-null  int64  
 1   text                     145094 non-null  object 
 2   source                   145094 non-null  object 
 3   user_id                  145094 non-null  int64  
 4   truncated                753 non-null     float64
 5   in_reply_to_status_id    145094 non-null  int64  
 6   in_reply_to_user_id      145094 non-null  int64  
 7   in_reply_to_screen_name  11286 non-null   object 
 8   retweeted_status_id      145094 non-null  int64  
 9   geo                      0 non-null       float64
 10  place                    0 non-null       float64
 11  contributors             0 non-null       float64
 12  retweet_count            145094 non-null  int64  
 13  reply_count              145094 non-

In [9]:
# Understand each feature
for column in tweets.columns:
    print(f"COLUMN {column}")
    print(tweets[column].head(), "\n\n")


COLUMN id
0    22642586115
1    22642583483
2    22642524678
3    22642504361
4    22642475789
Name: id, dtype: int64 


COLUMN text
0       CPPRI Recruitment 2010 at http://ping.fm/yp8zH
1    National Games Secretariat Recruitment 2010  :...
2       CIPET Recruitment Jobs at http://ping.fm/KnFCa
3        DIAT Recruitment 2010 at http://ping.fm/huS9m
4         BHEL Recruitment 2010 : http://ping.fm/PLWWA
Name: text, dtype: object 


COLUMN source
0    <a href="http://www.ping.fm/" rel="nofollow">P...
1    <a href="http://www.ping.fm/" rel="nofollow">P...
2    <a href="http://www.ping.fm/" rel="nofollow">P...
3    <a href="http://www.ping.fm/" rel="nofollow">P...
4    <a href="http://www.ping.fm/" rel="nofollow">P...
Name: source, dtype: object 


COLUMN user_id
0    7248952
1    7248952
2    7248952
3    7248952
4    7248952
Name: user_id, dtype: int64 


COLUMN truncated
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: truncated, dtype: float64 


COLUMN in_reply_to_status_id
0    0
1   

In [10]:
# Understand each feature
for column in users.columns:
    print(f"COLUMN {column}")
    print(users[column].head(), "\n\n")

COLUMN id
0     7248952
1     7732472
2     9524952
3    10788822
4    14596967
Name: id, dtype: int64 


COLUMN name
0     Bhuvan Chand
1    Daniel Wagner
2      Andrew Lock
3     Tim Thompson
4          fxgenie
Name: name, dtype: object 


COLUMN screen_name
0        tarunkjuyal
1       DanielWagner
2         Andrewlock
3    yourinsaneworld
4            fxgenie
Name: screen_name, dtype: object 


COLUMN statuses_count
0    1259
1     770
2    1100
3    6497
4    3203
Name: statuses_count, dtype: int64 


COLUMN followers_count
0      837
1     3274
2    38849
3     5902
4     2570
Name: followers_count, dtype: int64 


COLUMN friends_count
0     1978
1     3595
2    34504
3     5496
4     2638
Name: friends_count, dtype: int64 


COLUMN favourites_count
0    3200
1       8
2      41
3       0
4       0
Name: favourites_count, dtype: int64 


COLUMN listed_count
0       9
1      22
2    1014
3      82
4       5
Name: listed_count, dtype: int64 


COLUMN url
0                   http://

Features to Keep

In [11]:
Tweets_features = ["user_id", "retweet_count", "reply_count", "favorite_count", "num_hashtags", "num_urls", "num_mentions"]
Users_features = ["id", "statuses_count", "followers_count", "friends_count", "favourites_count", "listed_count", "created_at"]


## Cleaning

In [12]:
#tweets = tweets[Tweets_features]
users = users[Users_features]

Convert Data Types

In [14]:
users['created_at']

0      1183552203000L
1      1185440851000L
2      1192725360000L
3      1196614406000L
4      1209536534000L
            ...      
995    1267038541000L
996    1267067845000L
997    1267077720000L
998    1267078126000L
999    1268008058000L
Name: created_at, Length: 1000, dtype: object

In [13]:
users['created_at'] = pd.to_datetime(users['created_at'])

ParserError: Unknown string format: 1183552203000L present at position 0

Missing Values

In [106]:
# Fill missing values for all numeric columns in tweets DataFrame
for col in tweets.columns:
    if pd.api.types.is_numeric_dtype(tweets[col]):
        tweets[col] = tweets[col].fillna(0)

# Fill missing values for all numeric columns in users DataFrame
for col in users.columns:
    if pd.api.types.is_numeric_dtype(users[col]):
        users[col] = users[col].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets[col] = tweets[col].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users[col] = users[col].fillna(0)


Feature Engeniering in Users

In [107]:
users['account_age_years'] = 2015 - users['created_at'].dt.year
users['followers_to_friends_ratio'] = users['followers_count'] / users['friends_count']
users['followers_to_friends_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
users['followers_to_friends_ratio'] = users['followers_to_friends_ratio'].fillna(0)  # Handle division by zero

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users['account_age_years'] = 2015 - users['created_at'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users['followers_to_friends_ratio'] = users['followers_count'] / users['friends_count']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users['followers_to_friends_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be s

Drop Date

In [108]:
users = users.drop(["created_at"], axis=1)

Aggregate Tweet

In [109]:
tweets["num_tweets"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets["num_tweets"] = 1


In [110]:
tweets = tweets.groupby(['user_id']).sum().reset_index()

In [111]:
tweets["retweet_ratio"] = tweets["retweet_count"]/tweets["num_tweets"]
tweets["reply_ration"] = tweets["reply_count"]/tweets["num_tweets"]

Normalize

In [112]:
scaler = MinMaxScaler()
users.iloc[:,1:] = scaler.fit_transform(users.iloc[:,1:])
tweets.iloc[:,1:] = scaler.fit_transform(tweets.iloc[:,1:])


Merge

In [120]:
merged_df = pd.merge(tweets, users, left_on='user_id', right_on='id', how='inner')


Save Data 

In [126]:
# Define the new folder path
new_folder_path = './E13.csv/clean'

# Create the folder if it does not exist
if not os.path.exists(new_folder_path):
    os.makedirs(new_folder_path)

merged_df.to_csv(f'{new_folder_path}/clean_merged.csv', index=False, encoding='utf-8')
users.to_csv(f'{new_folder_path}/clean_users.csv', index=False, encoding='utf-8')

## Loop Across Folders

In [17]:
main_folders

['genuine_accounts.csv',
 'social_spambots_1.csv',
 'social_spambots_2.csv',
 'social_spambots_3.csv']

In [None]:
# Base directory where your main folders (genuine_accounts.csv, etc.) are located
base_directory = "../Data/cresci-2017.csv/datasets_full.csv/"



In [16]:
# Base directory where your main folders (genuine_accounts.csv, etc.) are located
base_directory = "../Data/cresci-2017.csv/datasets_full.csv/"

# List all main folders in the base directory
main_folders = [f for f in os.listdir(base_directory) if os.path.isdir(os.path.join(base_directory, f))]

# Loop through each main folder
for main_folder in main_folders:

    # Construct the path to the nested folder, assuming the repeated structure
    nested_folder_path = os.path.join(base_directory, main_folder, main_folder)
    
    # Construct the full paths to tweets.csv and users.csv within the nested folder
    tweets_path = os.path.join(nested_folder_path, "tweets.csv")
    users_path = os.path.join(nested_folder_path, "users.csv")

    # Now you can process these datasets as needed
    print(f"Processing datasets in {main_folder}...")
    
    # Load the datasets
    tweets = pd.read_csv(tweets_path, encoding='utf-8')
    users = pd.read_csv(users_path, encoding='utf-8')

    # Example processing: Just printing out the number of rows in each file
    print(f"Tweets: {tweets.shape[0]} rows, Users: {users.shape[0]} rows")

    # Reduce the features 
    Tweets_features = ["user_id", "retweet_count", "reply_count", "favorite_count", "num_hashtags", "num_urls", "num_mentions"]
    Users_features = ["id", "statuses_count", "followers_count", "friends_count", "favourites_count", "listed_count", "created_at"]
    tweets = tweets[Tweets_features]
    users = users[Users_features]

    # Convert Data Type
    users['created_at'] = pd.to_datetime(users['created_at'])

    # Missing Values
    # Fill missing values for all numeric columns in tweets DataFrame
    for col in tweets.columns:
        if pd.api.types.is_numeric_dtype(tweets[col]):
            tweets[col] = tweets[col].fillna(0)

    # Fill missing values for all numeric columns in users DataFrame
    for col in users.columns:
        if pd.api.types.is_numeric_dtype(users[col]):
            users[col] = users[col].fillna(0)

    # User Feature Eng
    users['account_age_years'] = 2017 - users['created_at'].dt.year
    users['followers_to_friends_ratio'] = users['followers_count'] / users['friends_count']
    users['followers_to_friends_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
    users['followers_to_friends_ratio'] = users['followers_to_friends_ratio'].fillna(0)

    # Drop Date
    users = users.drop(["created_at"], axis=1)

    # Aggregate Tweet
    tweets["num_tweets"] = 1
    tweets = tweets.groupby(['user_id']).sum().reset_index()

    # Feature Eng Tweets
    tweets["retweet_ratio"] = tweets["retweet_count"]/tweets["num_tweets"]
    tweets["reply_ration"] = tweets["reply_count"]/tweets["num_tweets"]

    # Normalize
    scaler = MinMaxScaler()
    users.iloc[:,1:] = scaler.fit_transform(users.iloc[:,1:])
    tweets.iloc[:,1:] = scaler.fit_transform(tweets.iloc[:,1:])

    # Merge
    merged_df = pd.merge(tweets, users, left_on='user_id', right_on='id', how='inner')

    # Add bot feature
    if main_folder == 'genuine_accounts.csv':
        merged_df["bot"] = 0
        users["bot"] = 0
    else:
        merged_df["bot"] = 1
        users["bot"] = 1

    # Define the new folder path
    new_folder_path = f'./{main_folder}/{main_folder}/clean'

    # Create the folder if it does not exist
    if not os.path.exists(new_folder_path):
        os.makedirs(new_folder_path)

    merged_df.to_csv(f'{new_folder_path}/clean_merged.csv', index=False, encoding='utf-8')
    users.to_csv(f'{new_folder_path}/clean_users.csv', index=False, encoding='utf-8')
    print("******FILES SAVED********\n\n")



Processing datasets in genuine_accounts.csv...


  tweets = pd.read_csv(tweets_path, encoding='utf-8')


Tweets: 2839362 rows, Users: 3474 rows
******FILES SAVED********


Processing datasets in social_spambots_1.csv...


  tweets = pd.read_csv(tweets_path, encoding='utf-8')


Tweets: 1610034 rows, Users: 991 rows


  tweets.iloc[:,1:] = scaler.fit_transform(tweets.iloc[:,1:])


******FILES SAVED********


Processing datasets in social_spambots_2.csv...


  tweets = pd.read_csv(tweets_path, encoding='utf-8')


Tweets: 428542 rows, Users: 3457 rows


  tweets.iloc[:,1:] = scaler.fit_transform(tweets.iloc[:,1:])


******FILES SAVED********


Processing datasets in social_spambots_3.csv...


  tweets = pd.read_csv(tweets_path, encoding='utf-8')


Tweets: 1418557 rows, Users: 464 rows
******FILES SAVED********




  tweets.iloc[:,1:] = scaler.fit_transform(tweets.iloc[:,1:])


Add Bot

In [115]:
if folder == 'E13.csv' or folder == 'TFP.csv':
    merged_df["bot"] = 0
    users["bot"] = 0
else:
    merged_df["bot"] = 1
    users["bot"] = 1


['E13.csv', 'FSF.csv', 'INT.csv', 'TFP.csv', 'TWT.csv']