# Import Packages

Begin by downloading the dehydrated dataset (tweet IDs) to `/data` from:
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/5QCCUU

In [5]:
# import twint
import nest_asyncio
nest_asyncio.apply()
import pandas as pd
import numpy as np
import yfinance as yf
import json
import requests
import datetime
import bigjson
import sqlite3
import string
import pickle

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import GridSearchCV

### Run the code below to hydrate the dataset and split it into manageable blocks

In [2]:
# Shell commands (kept commented out) that hydrate the tweet-id file with twarc
# and split the resulting JSONL into 200000-line files.
# NOTE(review): each `!` line runs in its own subshell, so `! cd` does not change
# the directory for the following lines — use `%cd data/split` if these are re-enabled.
# ! cd data/split
# ! twarc hydrate climate_id.txt.00 > hdrate.jsonl
# ! split -l 200000 hdrate.jsonl

#### Move split files to folder `/data/split`

### Define function for cleaning datasets

In [15]:
def clean_tweets(tweets):
    """Drop retweets and flatten the nested tweet fields into plain columns.

    Parameters
    ----------
    tweets : pandas.DataFrame
        One chunk of tweet records. It must provide the columns ``user``
        (one dict per row), ``place`` (a dict, or None/NaN when missing),
        ``entities`` (a dict holding a ``hashtags`` list) and a datetime-typed
        ``created_at``. ``retweeted_status`` is optional; when it is present,
        rows where it is not null are treated as retweets and removed.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) containing every input column
        that is not in the drop list, followed by the flattened columns
        ``names``, ``screen_names``, ``locations``, ``follower_counts``,
        ``user_created_at``, ``verified``, ``statuses_counts``, ``location``,
        ``country`` and ``hashtags``. ``created_at`` is a timezone-naive
        datetime and ``user_created_at`` is a ``YYYY-MM-DD`` string.
    """
    # remove retweets; the column may be absent (e.g. a chunk that holds no
    # retweets), in which case every row is kept
    if 'retweeted_status' in tweets.columns:
        tweets = tweets[tweets['retweeted_status'].isna()].copy()
    else:
        tweets = tweets.copy()

    # take nested user info and bring it to its own lists for appending to the
    # dataframe: output column name -> key inside the nested 'user' dict
    user_fields = {'names': 'name', 'screen_names': 'screen_name', 'locations': 'location',
                   'follower_counts': 'followers_count', 'user_created_at': 'created_at',
                   'verified': 'verified', 'statuses_counts': 'statuses_count'}
    users = list(tweets['user'])
    add_cols = {col: [user[field] for user in users] for col, field in user_fields.items()}

    # 'place' is only a dict on some rows. NaN is truthy, so a plain truthiness
    # test would try to subscript a float; check the type explicitly instead.
    places = list(tweets['place'])
    add_cols['location'] = [place['full_name'] if isinstance(place, dict) and place else None
                            for place in places]
    add_cols['country'] = [place['country'] if isinstance(place, dict) and place else None
                           for place in places]

    # list of hashtag texts per tweet, or None when the tweet has no hashtags
    add_cols['hashtags'] = [[tag['text'] for tag in entity['hashtags']] or None
                            for entity in tweets['entities']]

    # select columns to be dropped
    drop_cols = {'id_str', 'display_text_range', 'entities', 'source', 'in_reply_to_status_id',
                 'in_reply_to_status_id_str', 'in_reply_to_user_id_str', 'user', 'geo', 'coordinates',
                 'place', 'contributors', 'in_reply_to_user_id', 'quoted_status_id', 'quoted_status_id_str',
                 'quoted_status_permalink', 'quoted_status', 'favorited', 'retweeted', 'possibly_sensitive',
                 'extended_entities', 'retweeted_status'}

    # select kept columns (computed before the new columns are attached, so the
    # flattened columns always come last and in a fixed order)
    cols = [col for col in tweets.columns if col not in drop_cols] + list(add_cols)

    # add nested features directly to DataFrame
    for col, values in add_cols.items():
        tweets[col] = values

    # clean date formats: the string round-trip drops the timezone information
    tweets['created_at'] = pd.to_datetime(tweets['created_at'].dt.strftime("%Y-%m-%d %H:%M:%S"))
    # reformat date and leave as string
    tweets['user_created_at'] = pd.to_datetime(tweets['user_created_at']).dt.strftime("%Y-%m-%d")

    return tweets[cols]
    

In [5]:
# Names of the split files to load: xaa..xaz followed by xba..xbi.
chunk_names = [f"xa{letter}" for letter in string.ascii_lowercase]
chunk_names += [f"xb{letter}" for letter in string.ascii_lowercase[:9]]

# Clean each chunk on its own and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on
# every iteration.
cleaned_chunks = []
for chunk_name in chunk_names:
    print(f"start {chunk_name}")
    cleaned_chunks.append(clean_tweets(pd.read_json(f'data/split/{chunk_name}', lines=True)))

tweets = pd.concat(cleaned_chunks)

start xaa
start xab
start xac
start xad
start xae
start xaf
start xag
start xah
start xai
start xaj
start xak
start xal
start xam
start xan
start xao
start xap
start xaq
start xar
start xas
start xat
start xau
start xav
start xaw
start xax
start xay
start xaz
start xba
start xbb
start xbc
start xbd
start xbe
start xbf
start xbg
start xbh
start xbi


### Save concatenated DataFrame for next notebook

In [None]:
# reset index of concatenated DataFrame: drop=True discards the old row labels
# directly instead of materialising them as an 'index' column and dropping it again
tweets = tweets.reset_index(drop=True)

In [14]:
# Write the cleaned tweets to disk as CSV using pandas' default to_csv settings.
tweets.to_csv(path_or_buf='data/cleaned.csv')

In [12]:
# Persist the cleaned frame as a pickle; the context manager closes the file
# even if pickle.dump raises part-way through.
with open('data/pickle_jar/cleaned.pkl', 'wb') as pickle_out:
    pickle.dump(tweets, pickle_out)
    

In [13]:
# Sanity check: (rows, columns) of the cleaned, concatenated frame.
tweets.shape

(1970258, 21)

In [None]:
# Shared analyzer, created on first use so it is built once rather than per call.
_DEFAULT_ANALYZER = None


def sentiment_score(sentence, analyzer=None):
    """Return the VADER compound sentiment score for one piece of text.

    Parameters
    ----------
    sentence : str
        Text to score.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a dict with
        a ``'compound'`` entry. When omitted, a single shared
        ``SentimentIntensityAnalyzer`` is created on first use and reused for
        every later call, which matters when this function is mapped over a
        whole column of tweets.

    Returns
    -------
    The value stored under ``'compound'`` in the analyzer's score dict.
    """
    global _DEFAULT_ANALYZER

    if analyzer is None:
        # Instantiate the SentimentIntensityAnalyzer object only once
        if _DEFAULT_ANALYZER is None:
            _DEFAULT_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _DEFAULT_ANALYZER

    # store scoring data in dictionary
    sentiment_dict = analyzer.polarity_scores(sentence)

    return sentiment_dict['compound']

In [None]:
# Score every tweet's text with sentiment_score and store it in a new 'vader' column.
# NOTE(review): clean_tweets does not add a 'tweet' column — confirm the loaded
# data actually provides one before running this cell.
tweets['vader'] = tweets['tweet'].map(sentiment_score)

In [None]:
# Average of the per-tweet sentiment scores held in the 'vader' column.
sent_score = tweets['vader'].mean()

In [None]:
# Report the mean sentiment score rounded to three decimals.
# NOTE(review): s_date, e_date and key_word are not defined in any visible cell
# of this notebook — confirm they are set before this cell is run.
print(f"Between {s_date} and {e_date}, the sentiment score for {key_word} was on average {round(sent_score, 3)}.")

In [None]:
# Preview the first rows of the frame.
tweets.head()

## Financial Data

In [None]:
# TODO: implement a function to get the ticker symbol from a company name

In [None]:
# NOTE(review): `msft` is not defined in any visible cell — presumably a yfinance
# ticker object for Microsoft; confirm it is created first and that it really
# exposes a quarterly_rev() method.
msft.quarterly_rev()

In [None]:
# NOTE(review): compares a string literal with an identical literal, so this cell
# always evaluates to True; it looks like leftover scratch work unrelated to the
# analysis — confirm before keeping it.
"0x1a8E53C684f38E1AC640f3f510B0CbA3aFd3EE70" == "0x1a8E53C684f38E1AC640f3f510B0CbA3aFd3EE70"

In [None]:
# NOTE(review): GridSearchCV is constructed with no arguments and the result is
# discarded; this looks like a placeholder — confirm the intended estimator and
# parameter grid before running.
GridSearchCV()