# Main

In [1]:
# this is the main of the project

# libraries
import pandas as pd
import geopandas
import os
import matplotlib
import sklearn
import numpy as np
import mglearn
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
import statistics as st
from sklearn import preprocessing
from sklearn.metrics import silhouette_score 
from sklearn.cluster import KMeans 
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import *
from scipy.cluster.hierarchy import linkage, dendrogram

# custom libraries
from Geographycal_functions import drop_non_geolocalised
from Geographycal_functions import localize_tweets
from Geographycal_functions import localize_USA
from Preprocessing_functions import parallelize_dataframe
from Preprocessing_functions import preprocessing
from Preprocessing_functions import safe_drop_attr

# visualize progresses
from tqdm.auto import tqdm
tqdm.pandas()


%matplotlib inline 

# Import data

In [2]:
# Location of the cached per-state table. Built with os.path.join so the
# separator is correct on every OS and the literal needs no backslashes
# ("\P" and "\S" are invalid escape sequences in a normal string literal).
states_csv_path = os.path.join("Processed_data", "States_mined.csv")

# Flag used by the following cells:
#   True  -> no cache found, the table must be rebuilt from the raw tweets
#   False -> the table was loaded from the cache, the guarded cells are skipped
undone_flag = not os.path.isfile(states_csv_path)

if undone_flag:
    # import the raw data
    data_donald = pd.read_csv("hashtag_donaldtrump.csv", lineterminator='\n')
    data_joe = pd.read_csv("hashtag_joebiden.csv", lineterminator='\n')

    # join the two datasets, dropping rows that are exact duplicates
    data_all = pd.concat([data_joe, data_donald]).drop_duplicates().reset_index(drop=True)
else:
    # reuse the already processed per-state table
    state_df = pd.read_csv(states_csv_path)

# Geolocalization

In [3]:
if undone_flag:
    print("Geolocalization")

    # column names handed to the drop helper as the coordinate fields
    coordinate_columns = ("lat", "long")
    data_all = drop_non_geolocalised(data_all, *coordinate_columns)

    # NOTE(review): the second argument of both helpers looks like a plot
    # title -- confirm against Geographycal_functions.
    geo_df = localize_tweets(data_all, "World Tweet data distribution")
    df = localize_USA(geo_df, "USA Tweets data distribution")

# Preprocessing and text mining
We considered only the data localized in the USA.

This reduced the number of instances considerably.

In [4]:
if undone_flag:
    # attributes that are not carried on to the next steps
    to_be_deleted = [
        "tweet_id",
        "source",
        "user_id",
        "user_join_date",
        "user_location",
        "continent",
        "collected_at",
    ]
    # hand the list to the project helper and keep the frame it returns
    df = safe_drop_attr(df, to_be_deleted)

In [5]:
if undone_flag:
    # Text mining: run the `preprocessing` function over the dataframe
    # through the project's parallel helper, asking for 3 cores.
    df = parallelize_dataframe(
        df,
        preprocessing,
        n_cores=3,
    )

In [6]:
if undone_flag:
    # Save the text-mined dataframe `df` (not the raw `data_donald` input)
    # so that the content of Df_mined.csv matches its name.
    # The folder is created on demand, otherwise to_csv fails when it is missing.
    os.makedirs("Processed_data", exist_ok=True)
    df.to_csv(os.path.join("Processed_data", "Df_mined.csv"))

In [7]:
if undone_flag:
    # report the current type of `df` before converting it
    type_message = "The df's type is: {}. Therefore I save a copy and covert it into a pandas dataframe object"
    print(type_message.format(type(df)))

    # keep an untouched copy under `geo_df`, then rebuild `df` as a plain
    # pandas DataFrame for the steps that follow
    geo_df = df.copy()
    df = pd.DataFrame(df)

# Create the ML dataframe

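Since we are condensing all our data into 51 elements (one per state), we want each state to carry as much information as possible about the statistical population it represents. We are therefore adding several summary statistics to the dataframe.
Each statistic is computed per state over the tweet attributes: mean, skewness, median, count, variance, and standard deviation.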

In [8]:
if undone_flag:
    # Remove the two columns that are not needed for the per-state statistics.
    # The drop is not done in place and tolerates already-missing columns, so
    # re-running this cell does not raise a KeyError.
    df = df.drop(columns=["DRAWSEQ", "index_right"], errors="ignore")

In [9]:
if undone_flag:
    # group the data by state
    df_groupby = df.groupby("STATE_NAME")

    # One frame per statistic; the suffix added to the column names tells
    # which statistic each column holds.
    # numeric_only=True is passed to every numeric statistic: without it
    # var()/std() raise a TypeError on non-numeric columns in pandas >= 2.0,
    # and the call would be inconsistent with mean/skew/median.
    state_list = [
        # averages
        df_groupby.mean(numeric_only=True).add_suffix("_Mean"),
        # skewness
        df_groupby.skew(numeric_only=True).add_suffix("_Skewness"),
        # median
        df_groupby.median(numeric_only=True).add_suffix("_Median"),
        # count the tweets (non-null entries of every column)
        df_groupby.count().add_suffix("_Counts"),
        # variance
        df_groupby.var(numeric_only=True).add_suffix("_Variance"),
        # standard deviation
        df_groupby.std(numeric_only=True).add_suffix("_Std"),
    ]

    # concatenate all those data side by side to create a large dataframe
    # NOTE(review): here STATE_NAME is the index of state_df, while reading the
    # saved CSV back without index_col gives it as a regular column -- confirm
    # that the later cells handle both cases.
    state_df = pd.concat(state_list, axis=1)
    

In [10]:
if undone_flag:
    # Cache the per-state table so that later runs can load it directly.
    # os.path.join avoids the invalid "\P"/"\S" escapes and the hard-coded
    # Windows separator; the folder is created on demand because to_csv
    # fails when it is missing.
    os.makedirs("Processed_data", exist_ok=True)
    state_df.to_csv(os.path.join("Processed_data", "States_mined.csv"))

In [11]:
# preview the first rows of the per-state table (head() returns 5 rows by default)
state_df.head()

Unnamed: 0,STATE_NAME,likes_Mean,retweet_count_Mean,user_followers_count_Mean,TextBlob_Subjectivity_Mean,TextBlob_Polarity_Mean,likes_Skewness,retweet_count_Skewness,user_followers_count_Skewness,TextBlob_Subjectivity_Skewness,...,likes_Variance,retweet_count_Variance,user_followers_count_Variance,TextBlob_Subjectivity_Variance,TextBlob_Polarity_Variance,likes_Std,retweet_count_Std,user_followers_count_Std,TextBlob_Subjectivity_Std,TextBlob_Polarity_Std
0,Alabama,3.967799,0.972482,6975.041569,0.336137,0.083791,15.354061,14.302827,3.665861,0.460725,...,480.156549,28.248217,416129200.0,0.107459,0.082238,21.912475,5.314905,20399.244962,0.327809,0.286772
1,Alaska,1.191977,0.292264,3201.982808,0.331841,0.061126,8.201224,22.596758,4.311367,0.327775,...,18.468113,5.908721,110072200.0,0.090065,0.069299,4.297454,2.430786,10491.530379,0.300108,0.263247
2,Arizona,5.016849,1.124816,2947.507443,0.321295,0.068841,38.970872,27.822626,22.353001,0.517254,...,5229.406293,184.669137,148752100.0,0.096776,0.080643,72.314634,13.589302,12196.398804,0.311089,0.283978
3,Arkansas,1.371534,0.411275,4821.268022,0.324972,0.07269,8.065272,7.33319,5.191989,0.466071,...,21.512158,2.62163,426176100.0,0.100653,0.068388,4.63812,1.619145,20644.033938,0.317258,0.26151
4,California,5.424561,1.239721,7322.299407,0.318791,0.066259,95.096283,104.154739,32.793312,0.514186,...,13503.820612,1238.382601,11468730000.0,0.09922,0.07934,116.205941,35.190661,107092.171878,0.314991,0.281674
