# Main

In [1]:
# this is the main of the project

# libraries
import pandas as pd
import geopandas
import os
import matplotlib
import sklearn
import numpy as np
import mglearn
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
import statistics as st
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score 
from sklearn.cluster import KMeans 
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import *
from scipy.cluster.hierarchy import linkage, dendrogram

# custom libraries
from Geographycal_functions import drop_non_geolocalised
from Geographycal_functions import localize_tweets
from Geographycal_functions import localize_USA
from Preprocessing_functions import parallelize_dataframe
from Preprocessing_functions import text_preprocessing
from Preprocessing_functions import text_mining
from Preprocessing_functions import safe_drop_attr
from Preprocessing_functions import select_dates_tweets


# visualize progresses
from tqdm.auto import tqdm

%matplotlib inline 

# Import data

In [2]:
if os.path.isfile(".\Processed_data\States_mineddf_total.csv"):                       # we can choose one of the two
    state_df = pd.read_csv(".\Processed_data\States_mined.csv")
    # finish here
    undone_flag = False
else:
    # import the raw data
    data_donald = pd.read_csv("hashtag_donaldtrump.csv", lineterminator='\n')
    data_joe = pd.read_csv("hashtag_joebiden.csv", lineterminator='\n')
    
    # joining the two datasets dropping duplicates!!
    data_all = pd.concat([data_joe,data_donald]).drop_duplicates().reset_index(drop=True)
    
    # flag to be used in the next steps of the project
    undone_flag = True 

# Preprocessing - Geolocalization

In [3]:
if undone_flag:
    print("Geolocalization")
    data_all = drop_non_geolocalised(data_all, "lat", "long")
    geo_df = localize_tweets(data_all, "World Tweet data distribution")
    df = localize_USA(geo_df, "USA Tweets data distribution")

# Preprocessing - text mining
We considered only the data localized in the USA.

This diminished the number of instances by a lot.

In [4]:
if undone_flag:
    # data to drop
    to_be_deleted = ["tweet_id", "source", "user_id", "user_join_date", "user_location", "continent", "collected_at"]
    # drop
    df = safe_drop_attr(df, to_be_deleted)

In [5]:
if undone_flag:
    # Text mining
    df = text_preprocessing(df)
    
    # save those preliminary results, since run_time is high
    df.to_csv(".\Processed_data\Df_languages_detected.csv")
    
    df, lang_state_df = text_mining(df)

In [6]:
if undone_flag:
    # save the data into a folder
    df.to_csv(".\Processed_data\Df_mined.csv")
    lang_state_df.to_csv(".\Processed_data\lang_share_df.csv")

In [7]:
if undone_flag:
    # get the lenght of the dataframe to normalize the data
    num_instances = len(df["STATE_NAME"])

# Select different time-frames from our data

In particular we select the dates before the last public debate, before the elections and after the election day

In [8]:
undone_flag = True
df = pd.read_csv(".\Processed_data\Df_mined.csv")

#lang_state_df = pd.read_csv(".\Processed_data\States_mined.csv", usecols=["STATE_NAME", "%_english"])
print(df.shape)

# get the temporal division!
df_last_debate, df_election_day, df_total = select_dates_tweets(df)
print("Number of tweets till the day of the last debate: ", len(df_last_debate))
print("Number of tweets till the day of the elections: ", len(df_election_day))
print("Number of total tweets: ", len(df_total))

(388853, 21)
Number of tweets till the day of the last debate:  88636
Number of tweets till the day of the elections:  263306
Number of total tweets:  388853


In [9]:
my_dfs = [df_last_debate, df_election_day, df_total]
supp_names = ["df_last_debate", "df_election_day", "df_total"]

# Create the ML dataframe

Since we are squeezing all our data into 51 elements, we want to add to each state as much information about the statistical population they represents, we are therefore adding some statistics to the dataframe.
Each statistic is related to

In [10]:
if undone_flag:
    for df in my_dfs:
        df.drop(columns=["DRAWSEQ", "index_right", "Unnamed: 0", "Unnamed: 0.1",
                         "STATE_FIPS", "SUB_REGION", "STATE_ABBR", "geometry", "created_at",
                        "user_screen_name", "user_description", "clean_tweet", "Languages", "user_name"],inplace=True)

In [12]:
if undone_flag:
    # creating dictionaries
    state_df = {}
    supp = 0
    
    # initialize the scaler
    my_scaler = StandardScaler()
    
    for df in my_dfs:
        # visualize the bar 
        tqdm.pandas(desc="Statistical mining: ")
        num_instances = len(df)
    
        # initialize an empty list where to append the statistics
        state_list = []
        # group the data by state
        df_groupby = df.groupby(["STATE_NAME"])
        # averages
        state_list.append(df_groupby.mean(numeric_only=True).add_suffix("_Mean"))
        # skewness
        state_list.append(df_groupby.skew(numeric_only=True).add_suffix("_Skewness"))  
        # median
        state_list.append(df_groupby.median(numeric_only=True).add_suffix("_Median"))  
        # count the tweets and normalize the count wrt the total number of instances
        # state_list.append(df_groupby.count().add_suffix("_Counts") / num_instances)  
        # variance
        state_list.append(df_groupby.var().add_suffix("_Variance"))
        # standard deviation
        state_list.append(df_groupby.std().add_suffix("_Std"))
        # 0.1 quantile
        state_list.append(df_groupby.quantile(q=0.1).add_suffix("_0_1_quant"))
        # 0.9 quantile
        state_list.append(df_groupby.quantile(q=0.9).add_suffix("_0_9_quant"))
        
        
        # concatenate all those data to create a large dataframe
        state_df[supp_names[supp]] = pd.concat(state_list, axis=1)
        
        # % of english speakers
        # state_df[supp_names[supp]].merge(lang_state_df, on="STATE_NAME")
        
        # create a key for the non categorical values
        logic = [col != "STATE_NAME" for col in state_df[supp_names[supp]].columns]
        original_key = state_df[supp_names[supp]].columns 
        key = state_df[supp_names[supp]].columns[logic]
        
        ## scale
        #state_df[supp_names[supp] + "_scaled"] = my_scaler.fit_transform(
        #    state_df[supp_names[supp]].loc[:, key])
        #
        ## trasform in a df
        #state_df[supp_names[supp] + "_scaled"] = pd.DataFrame(
        #    state_df[supp_names[supp] + "_scaled"], columns=key)
        #
        ## add the state name
        #state_df[supp_names[supp] + "_scaled"] = pd.concat(
        #    [pd.Series(df_groupby.groups.keys()), state_df[supp_names[supp] + "_scaled"]], axis=1)
        
        # save
        state_df[supp_names[supp]].to_csv(".\Processed_data\States_mined" + supp_names[supp] + ".csv")
        
        # update index
        supp += 1

  state_list.append(df_groupby.quantile(q=0.1).add_suffix("_0_1_quant"))
  state_list.append(df_groupby.quantile(q=0.9).add_suffix("_0_9_quant"))
  state_list.append(df_groupby.quantile(q=0.1).add_suffix("_0_1_quant"))
  state_list.append(df_groupby.quantile(q=0.9).add_suffix("_0_9_quant"))
  state_list.append(df_groupby.quantile(q=0.1).add_suffix("_0_1_quant"))
  state_list.append(df_groupby.quantile(q=0.9).add_suffix("_0_9_quant"))


In [13]:
state_df["df_election_day"].head(5)

Unnamed: 0_level_0,likes_Mean,retweet_count_Mean,user_followers_count_Mean,TextBlob_Subjectivity_Mean,TextBlob_Polarity_Mean,likes_Skewness,retweet_count_Skewness,user_followers_count_Skewness,TextBlob_Subjectivity_Skewness,TextBlob_Polarity_Skewness,...,likes_0_1_quant,retweet_count_0_1_quant,user_followers_count_0_1_quant,TextBlob_Subjectivity_0_1_quant,TextBlob_Polarity_0_1_quant,likes_0_9_quant,retweet_count_0_9_quant,user_followers_count_0_9_quant,TextBlob_Subjectivity_0_9_quant,TextBlob_Polarity_0_9_quant
STATE_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,3.875874,1.09528,8374.406469,0.339647,0.079804,14.6491,13.358431,3.177572,0.446737,0.430516,...,0.0,0.0,23.0,0.0,-0.1575,6.0,2.0,8958.0,0.8,0.5
Alaska,0.898662,0.271511,2464.818356,0.341844,0.051959,9.46018,21.212594,5.306806,0.332601,0.554995,...,0.0,0.0,11.0,0.0,-0.199062,2.0,0.8,4055.0,0.75,0.4
Arizona,2.654507,0.738056,2924.439398,0.319254,0.061104,15.211403,13.855179,12.340654,0.509926,0.096939,...,0.0,0.0,32.0,0.0,-0.207593,3.0,1.0,7481.0,0.75,0.413333
Arkansas,1.207643,0.449682,6192.387261,0.324843,0.060139,8.892309,7.217228,4.383891,0.428717,0.56768,...,0.0,0.0,41.8,0.0,-0.185227,2.0,1.0,2673.4,0.7,0.34125
California,4.30274,1.139293,6724.195873,0.320105,0.051099,106.40244,115.622329,33.325834,0.505455,0.206356,...,0.0,0.0,25.0,0.0,-0.225,3.0,1.0,6004.0,0.75,0.4


In [None]:
state_df["df_election_day"].columns

In [None]:
state_df.head(5)

In [None]:
my_scaler = StandardScaler()

In [None]:
state_df_scaled = my_scaler.fit_transform(state_df.loc[:, state_df.columns != 'STATE_NAME'])

In [None]:
state_df_scaled = pd.DataFrame(state_df_scaled, columns=state_df.loc[:, state_df.columns != 'STATE_NAME'].columns)

In [None]:
state_df_scaled = pd.concat([state_df["STATE_NAME"], state_df_scaled], axis=1)

In [None]:
state_df_scaled.to_csv("State_df_scaled.csv")