# Main

In [15]:
# this is the main of the project

# libraries
import pandas as pd
import geopandas

# custom libraries
from Geographycal_functions import drop_non_geolocalised
from Geographycal_functions import localize_tweets
from Geographycal_functions import localize_USA
from Preprocessing_functions import parallelize_dataframe
from Preprocessing_functions import preprocessing
from Preprocessing_functions import safe_drop_attr

# progress bars for long-running operations
from tqdm.auto import tqdm
tqdm.pandas()

# Import data

In [2]:
# load the raw tweet dumps of both hashtags (rows are terminated by '\n')
data_donald, data_joe = (
    pd.read_csv(csv_name, lineterminator='\n')
    for csv_name in ("hashtag_donaldtrump.csv", "hashtag_joebiden.csv")
)

# Geolocalization

In [3]:
# custom geolocalization: the same three helper calls are applied to the
# tweets of each candidate; data_* ends up holding the localize_USA result

# Trump tweets
print("Donald Trump")
geo_donald = localize_tweets(
    drop_non_geolocalised(data_donald, "lat", "long"),
    "World Trump data distribution",
)
data_donald = localize_USA(geo_donald, "USA Trump data distribution")

# Biden tweets
print("Joe Biden")
geo_joe = localize_tweets(
    drop_non_geolocalised(data_joe, "lat", "long"),
    "World Joe data distribution",
)
data_joe = localize_USA(geo_joe, "USA Joe data distribution")




Donald Trump
The number of instances in the df is:  970919
The number of instances after dropping the non localized records is:  445719


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4269
Right CRS: None

  gdf = geopandas.sjoin(usa, gdf, how="inner", op='contains')


The number of tweets available is: 210189 

Joe Biden
The number of instances in the df is:  776886
The number of instances after dropping the non localized records is:  355293


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4269
Right CRS: None

  gdf = geopandas.sjoin(usa, gdf, how="inner", op='contains')


The number of tweets available is: 178665 



# Preprocessing and text mining
Only the tweets geolocated in the USA are considered.

This reduces the number of instances considerably: from 970,919 to 210,189 tweets for Trump and from 776,886 to 178,665 tweets for Biden.

In [4]:
# attributes that are removed from both frames
to_be_deleted = [
    "tweet_id",
    "source",
    "user_id",
    "user_join_date",
    "user_location",
    "continent",
    "collected_at",
]

# same helper, same column list, applied to each candidate's frame in turn
data_donald, data_joe = (
    safe_drop_attr(frame, to_be_deleted) for frame in (data_donald, data_joe)
)

In [5]:
# Text mining: run `preprocessing` over each frame through
# parallelize_dataframe with n_cores=3 (Trump first, then Biden)
data_donald, data_joe = (
    parallelize_dataframe(data_donald, preprocessing, n_cores=3),
    parallelize_dataframe(data_joe, preprocessing, n_cores=3),
)


In [12]:
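# save the mined data into a folder (the hard-coded path needs to be adapted to your machine!)
# NOTE: a later cell reads D:\Data_DDS\Joe_mined.csv, so these lines must have been run at least once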
# data_donald.to_csv("D:\Data_DDS\Donald_mined.csv")
# data_joe.to_csv("D:\Data_DDS\Joe_mined.csv")

# Create the ML dataframe

In [13]:
# per-state mean of every numeric column, computed for each candidate
state_donald_avg, state_joe_avg = (
    tweets.groupby("STATE_NAME").mean(numeric_only=True)
    for tweets in (data_donald, data_joe)
)


In [14]:
# Remove the columns that are not wanted in the exported per-state averages.
# drop() is used without inplace=True and with errors="ignore", so running
# this cell a second time (columns already gone) no longer raises a KeyError.
state_donald_avg = state_donald_avg.drop(columns=["DRAWSEQ", "index_right"], errors="ignore")
state_joe_avg = state_joe_avg.drop(columns=["DRAWSEQ", "index_right"], errors="ignore")

# Raw strings: "\D" and "\S" are not valid escape sequences, so the plain
# literals only produced the intended path by accident and trigger a warning
# on recent Python versions. The resulting path is unchanged.
state_donald_avg.to_csv(r"D:\Data_DDS\States_donald_mined.csv")
state_joe_avg.to_csv(r"D:\Data_DDS\States_joe_mined.csv")


In [20]:
# Reload the mined Biden tweets from disk. Raw string: "\D" and "\J" are not
# valid escape sequences, so the plain literal only produced the intended
# path by accident and triggers a warning on recent Python versions.
gdp = pd.read_csv(r"D:\Data_DDS\Joe_mined.csv")

In [21]:
# Display the reloaded frame (the last expression of the cell is rendered).
# NOTE(review): the rendered rows include user names and descriptions —
# consider clearing this output before sharing the notebook.
gdp

Unnamed: 0.1,Unnamed: 0,STATE_NAME,DRAWSEQ,STATE_FIPS,SUB_REGION,STATE_ABBR,geometry,index_right,created_at,likes,retweet_count,user_name,user_screen_name,user_description,user_followers_count,clean_tweet,Languages,TextBlob_Subjectivity,TextBlob_Polarity,TextBlob_Analysis
0,0,Hawaii,1,15,Pacific,HI,MULTIPOLYGON (((-160.0738033454681 22.00417734...,23023,2020-10-16 08:11:08,0.0,0.0,Debbi ~Freedom is wearing a mask to protect ot...,OutcomeOptimist,"Be kind, but stand tall. Think for yourself. F...",71.0,watched both town halls i can say that one f...,"{'language': 'en', 'score': 0.9999967960431493}",0.225000,0.025000,Positive
1,0,Hawaii,1,15,Pacific,HI,MULTIPOLYGON (((-160.0738033454681 22.00417734...,164345,2020-10-24 21:41:51,0.0,0.0,Kathy😷(#WashYourFuckingHands)😷,everybodyzzmama,This shit is exhausting. I’m pissed off & swea...,15053.0,sarahbcalif heard he went to the villages am...,"{'language': 'en', 'score': 0.9999971863555244}",0.900000,-0.700000,Negative
2,0,Hawaii,1,15,Pacific,HI,MULTIPOLYGON (((-160.0738033454681 22.00417734...,661390,2020-11-07 19:10:19,6.0,2.0,Ms B Havin 🌊👊 #TheResistance,MsBehavinMarin,Retired beach dweller 😎 Lover of whales 🐋 Rock...,14412.0,now we can call them president elect amp pre...,"{'language': 'en', 'score': 0.5714262355639195}",0.900000,1.000000,Positive
3,0,Hawaii,1,15,Pacific,HI,MULTIPOLYGON (((-160.0738033454681 22.00417734...,460947,2020-11-05 03:26:35,7.0,3.0,Ms B Havin 🌊👊 #TheResistance,MsBehavinMarin,Retired beach dweller 😎 Lover of whales 🐋 Rock...,14407.0,today we reached a record of new covi...,"{'language': 'en', 'score': 0.999994884189061}",0.379708,0.132900,Positive
4,0,Hawaii,1,15,Pacific,HI,MULTIPOLYGON (((-160.0738033454681 22.00417734...,769789,2020-11-08 17:35:24,0.0,0.0,Mind Cosmos,MindCosmos,3L,203.0,if you re not a democrat when you re you h...,"{'language': 'en', 'score': 0.9999985752708884}",0.000000,0.000000,Neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178660,50,Alaska,51,2,Pacific,AK,MULTIPOLYGON (((-161.3337853466416 58.73324838...,17983,2020-10-16 01:31:38,0.0,0.0,Tony Bobulinski,buckdancer57,#Walkaway #MAGA #Platinumplan,131.0,joebiden gang strikes again https t co l o...,"{'language': 'nl', 'score': 0.7142831521903917}",0.000000,0.000000,Neutral
178661,50,Alaska,51,2,Pacific,AK,MULTIPOLYGON (((-161.3337853466416 58.73324838...,185245,2020-10-26 11:06:32,0.0,0.0,Tony Bobulinski,buckdancer57,#Walkaway #MAGA #Platinumplan,130.0,https t co ihizj stbu great video for gettin...,"{'language': 'en', 'score': 0.9999982385241484}",0.694444,-0.077778,Negative
178662,50,Alaska,51,2,Pacific,AK,MULTIPOLYGON (((-161.3337853466416 58.73324838...,456202,2020-11-05 01:31:11,0.0,0.0,Tony Bobulinski,buckdancer57,#Walkaway #MAGA #Platinumplan,135.0,the deceased love joebiden the rest of us suf...,"{'language': 'en', 'score': 0.9999953044153389}",0.600000,0.500000,Positive
178663,50,Alaska,51,2,Pacific,AK,MULTIPOLYGON (((-161.3337853466416 58.73324838...,142833,2020-10-23 13:10:44,0.0,0.0,Tony Bobulinski,buckdancer57,#Walkaway #MAGA #Platinumplan,130.0,fact check joebiden did not run against abr...,"{'language': 'en', 'score': 0.9999974243278751}",1.000000,0.600000,Positive
