# Main

In [1]:
# this is the main of the project

# libraries
import pandas as pd
import geopandas
import os
import matplotlib
import sklearn
import numpy as np
import mglearn
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
import statistics as st
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score 
from sklearn.cluster import KMeans 
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import *
from scipy.cluster.hierarchy import linkage, dendrogram

# custom libraries
from Geographycal_functions import drop_non_geolocalised
from Geographycal_functions import localize_tweets
from Geographycal_functions import localize_USA
from Preprocessing_functions import parallelize_dataframe
from Preprocessing_functions import text_preprocessing
from Preprocessing_functions import text_mining
from Preprocessing_functions import safe_drop_attr



# visualize progresses
from tqdm.auto import tqdm
tqdm.pandas()


%matplotlib inline 

# Import data

In [2]:
# Load the cached per-state dataframe when it exists; otherwise load the raw
# tweet dumps and flag that the whole pipeline still has to be run.
# The path is built with os.path.join: the previous ".\Processed_data\..."
# literal relied on invalid escape sequences ("\P", "\S") and only resolved
# on Windows.
states_mined_path = os.path.join("Processed_data", "States_mined.csv")

if os.path.isfile(states_mined_path):                       # we can choose one of the two
    # cached result: the later `if undone_flag:` cells are skipped
    state_df = pd.read_csv(states_mined_path)
    undone_flag = False
else:
    # import the raw data
    data_donald = pd.read_csv("hashtag_donaldtrump.csv", lineterminator='\n')
    data_joe = pd.read_csv("hashtag_joebiden.csv", lineterminator='\n')

    # join the two datasets, dropping the rows duplicated between them
    data_all = pd.concat([data_joe, data_donald]).drop_duplicates().reset_index(drop=True)

    # flag to be used in the next steps of the project
    undone_flag = True

# Geolocalization

In [3]:
if undone_flag:
    # Runs only when the raw tweet dumps were loaded (undone_flag set by the import cell).
    print("Geolocalization")
    # Project helper, given the names of the latitude/longitude columns.
    # Judging by the counts it prints, it removes the records without coordinates.
    data_all = drop_non_geolocalised(data_all, "lat", "long")
    # NOTE(review): the second argument looks like a plot title and the helper
    # presumably returns a geo-referenced frame -- confirm in Geographycal_functions.
    geo_df = localize_tweets(data_all, "World Tweet data distribution")
    # `df` is the frame every following pipeline cell works on.
    # NOTE(review): presumably restricted to tweets located inside the USA -- confirm.
    df = localize_USA(geo_df, "USA Tweets data distribution")

Geolocalization
The number of instances in the df is:  1747804
The number of instances after dropping the non localized records is:  801011


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4269
Right CRS: None

  gdf = geopandas.sjoin(usa, gdf, how="inner", op='contains')


The number of tweets available is: 388853 

<class 'pandas.core.frame.DataFrame'>


# Preprocessing and text mining
We considered only the data localized in the USA.

This substantially reduced the number of instances: from 801,011 geolocalized tweets to 388,853 tweets located in the USA.

In [4]:
if undone_flag:
    # attributes removed from the dataframe before the text-mining step
    to_be_deleted = [
        "tweet_id",
        "source",
        "user_id",
        "user_join_date",
        "user_location",
        "continent",
        "collected_at",
    ]
    # project helper that returns the dataframe without the listed attributes
    df = safe_drop_attr(df, to_be_deleted)

In [None]:
if undone_flag:
    # Text mining
    df = text_preprocessing(df)

    # Save those preliminary results, since run_time is high.
    # The folder is created if missing (to_csv does not create it), and the path
    # is built with os.path.join instead of a ".\Processed_data\..." literal,
    # which relied on invalid escape sequences and only resolved on Windows.
    os.makedirs("Processed_data", exist_ok=True)
    df.to_csv(os.path.join("Processed_data", "Df_languages_detected.csv"))

    # text_mining returns the mined dataframe together with a second frame
    # (lang_state_df) that is later concatenated to the per-state statistics
    df, lang_state_df = text_mining(df)

In [2]:
# Reload the checkpoint written after text_preprocessing.
# The pipeline saves it inside Processed_data, so look there first and fall
# back to the working directory (the location this cell used to read from).
# NOTE(review): the checkpoint was written with its index, so it comes back as
# an extra "Unnamed: 0" column -- confirm whether index_col=0 is wanted here.
languages_checkpoint = os.path.join("Processed_data", "Df_languages_detected.csv")
if not os.path.isfile(languages_checkpoint):
    languages_checkpoint = "Df_languages_detected.csv"
df_base = pd.read_csv(languages_checkpoint)

In [None]:
# Mine a fresh copy of the checkpoint so that df_base itself is left untouched
# and the cell can be re-run without reloading the CSV.
# NOTE(review): this runs regardless of undone_flag.
df, lang_state_df = text_mining(df_base.copy())



In [None]:
# Force the remaining `if undone_flag:` cells to run on the freshly mined df.
# (The previous spelling `undonw_flag` created an unused variable and left the
# real flag unchanged.)
# NOTE(review): this overrides the cache check done when importing the data.
undone_flag = True

In [None]:
if undone_flag:
    # Save the mined data into the Processed_data folder.
    # The folder is created if missing, and the path is built with os.path.join
    # instead of a ".\Processed_data\..." literal (invalid escapes, Windows-only).
    os.makedirs("Processed_data", exist_ok=True)
    df.to_csv(os.path.join("Processed_data", "Df_mined.csv"))

In [None]:
if undone_flag:
    # total number of rows (tweets); used later to normalize the per-state counts
    num_instances = df["STATE_NAME"].shape[0]

# Create the ML dataframe

Since we are squeezing all our data into 51 elements, we want to attach to each state as much information as possible about the statistical population it represents. We therefore add several summary statistics to the dataframe.
Each statistic is computed per state, over the tweets assigned to that state.

In [None]:
if undone_flag:
    # Remove the two columns that are not wanted in the per-state statistics.
    # Reassigning (instead of inplace=True) with errors="ignore" keeps the cell
    # re-runnable: a second execution no longer raises KeyError because the
    # columns are already gone.
    df = df.drop(columns=["DRAWSEQ", "index_right"], errors="ignore")

In [None]:
if undone_flag:
    # Build the per-state dataframe: every entry of state_list is a frame with
    # one row per STATE_NAME, and the suffix keeps the column names distinct
    # once the frames are concatenated side by side.

    # group the data by state
    df_groupby = df.groupby(["STATE_NAME"])

    state_list = [
        # averages
        df_groupby.mean(numeric_only=True).add_suffix("_Mean"),
        # skewness
        df_groupby.skew(numeric_only=True).add_suffix("_Skewness"),
        # median
        df_groupby.median(numeric_only=True).add_suffix("_Median"),
        # count the tweets and normalize the count wrt the total number of instances
        df_groupby.count().add_suffix("_Counts") / num_instances,
        # variance -- numeric_only=True like the statistics above; without it
        # recent pandas raises on the non-numeric (text) columns
        df_groupby.var(numeric_only=True).add_suffix("_Variance"),
        # standard deviation
        df_groupby.std(numeric_only=True).add_suffix("_Std"),
        # % of english speakers
        # NOTE(review): assumes lang_state_df is indexed by state so that the
        # concat below aligns it with the other frames -- confirm in text_mining
        lang_state_df,
    ]

    # concatenate all those data to create a large dataframe
    state_df = pd.concat(state_list, axis=1)
    
    
    

In [None]:
if undone_flag:
    # Cache the per-state dataframe; the import cell looks for this file to
    # decide whether the pipeline has to be run again.
    # The folder is created if missing, and the path is built with os.path.join
    # instead of a ".\Processed_data\..." literal (invalid escapes, Windows-only).
    os.makedirs("Processed_data", exist_ok=True)
    state_df.to_csv(os.path.join("Processed_data", "States_mined.csv"))

In [None]:
# preview the first five rows of the per-state dataframe (head() defaults to 5)
state_df.head()

In [None]:
#my_scaler = StandardScaler()

In [None]:
#state_df_scaled = my_scaler.fit_transform(state_df.loc[:, state_df.columns != 'STATE_NAME'])

In [None]:
#state_df_scaled = pd.DataFrame(state_df_scaled, columns=state_df.loc[:, state_df.columns != 'STATE_NAME'].columns)

In [None]:
#state_df_scaled = pd.concat([state_df["STATE_NAME"], state_df_scaled], axis=1)

In [None]:
#state_df_scaled.to_csv("State_df_scaled.csv")