In [None]:
import twint
import nest_asyncio
import pandas as pd
nest_asyncio.apply()

# pd.set_option("display.max_columns", 999)

# NLTK VADER for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import numpy as np

import yfinance as yf

import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

import scipy
from sklearn import preprocessing

# granger causality analysis 
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests

# source: https://github.com/twintproject/twint

In [None]:
# Configure the twint scraper: build a Config object, set the search filters
# and the JSON output target, then run the search with that configuration.
c = twint.Config()

# Search term. Other terms noted by the author: tesla & TSLA | &zm | $ual | $siri | $pcar | $nvda | $wba | $sbux
c.Search = "tsla"
# c.Since = '2021-01-01'
# c.Until = '2021-01-03'

# c.Debug = True
# c.Limit = 10

# Location filters are problematic: most users do not disclose their geolocation --> data loss
# c.Near = "NYC"
# c.Geo="40.730610,-73.935242,500km" 

c.Language = "en"


# Verified filter: restricting to verified accounts was used to tackle the data abundance problem
# (too much data -> too much time -> a lot of noise), at the cost of a skew towards newspapers and "famous" people.
# NOTE(review): the flag is set to False here, so the filter is presumably off — confirm this is intended.
c.Verified = False

## Popularity filters - option 1: min_likes, min_retweets, min_replies - option 2: popular-tweets (pt)
# c.Min_likes = 10
# c.Min_retweets = 10
# problem with 10 -> 12 -> 9
# A minimum number of replies removes most of the noise, since bots or scammers usually do not comment much - liking or retweeting is easier
# c.Min_replies = 5

# not working!
# c.Pt = True

# not working!
# c.Filter_retweets = False



# c.Stats = True


# Store the results as JSON at the path given in c.Output.
c.Store_json = True
# c.Custom_json = ["id", "user_id", "created_at", "date", "tweet", "timezone", "hashtags"]
c.Output = "/Users/pietj.ginski/Desktop/BWL-Studium/BWL 6 Semester/Bachelor Thesis/Raw Data BT/archive-2/test.json"


# Run the search.
# NOTE(review): the original comment claimed this call was commented out, but it is active —
# executing this cell starts a scrape. Comment the line out if that is not wanted.
twint.run.Search(c)

In [None]:
# *** NLP Function ***

def nlp(data, analyzer=None):
    """Score every non-retweet tweet with VADER and add sentiment columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``tweet`` column (the text handed to the analyzer) and a
        ``retweet`` column; only rows where ``retweet == False`` are kept.
    analyzer : object, optional
        Any object with a ``polarity_scores(text)`` method returning a mapping
        that contains a ``'compound'`` key. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created. Passing one in lets several
        datasets share a single analyzer instance.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the kept rows plus:
        ``vader_scores`` (the full score mapping per tweet), ``compound``
        (its ``'compound'`` entry) and ``delta_compound`` (relative change of
        ``compound`` versus the previous kept row; the first row is NaN and a
        previous value of 0 yields inf/NaN).
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # filter out retweets - we need individual thoughts without any "pre influence".
    # .copy() gives an independent frame, so the column assignments below do not
    # write onto a slice of the caller's data (avoids SettingWithCopyWarning).
    originals = data[data['retweet'] == False].copy()  # noqa: E712 - element-wise comparison

    # get the vader scores
    originals['vader_scores'] = originals['tweet'].apply(analyzer.polarity_scores)

    # extract the compound score
    originals['compound'] = originals['vader_scores'].apply(lambda scores: scores['compound'])

    # relative change of the compound score from one kept tweet to the next
    originals['delta_compound'] = (originals['compound'] / originals['compound'].shift(1)) - 1

    return originals

In [None]:
#  *** Engagement Score Function*** 

def engagement_score(df):
    """Add a normalised engagement score and its sentiment-weighted variant.

    ``replies_count``, ``retweets_count`` and ``likes_count`` are each min-max
    scaled (via ``MinMaxScaler``) and summed per row into ``engagement_score``.
    ``compound_engagement_score`` is that score multiplied by the row's
    ``compound`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three count columns and a ``compound`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the two new columns.
    """
    counts = df[['replies_count', 'retweets_count', 'likes_count']].values
    scaled = preprocessing.MinMaxScaler().fit_transform(counts)

    # Keep the row sums as a plain ndarray so the assignment is positional.
    # Wrapping them in a DataFrame would give them a fresh 0..n-1 index, and
    # pandas would then align on index labels when assigning — producing NaN or
    # shifted scores whenever df's index is anything other than 0..n-1.
    df['engagement_score'] = scaled[:, 0] + scaled[:, 1] + scaled[:, 2]
    df['compound_engagement_score'] = df['engagement_score'] * df['compound']

    return df


In [None]:
# *** Stock Data Function ***

def yahoo(ticker, start_date, end_date, df_name=None):
    """Download daily price history for ``ticker`` and add return/date columns.

    Parameters
    ----------
    ticker : str
        Ticker symbol passed to ``yf.Ticker``.
    start_date, end_date : str
        Date window forwarded to ``Ticker.history`` as ``start`` and ``end``.
    df_name : object, optional
        Unused; kept only so existing four-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``Ticker.history`` plus ``daily_returns``
        (relative change of ``Close`` versus the previous row; first row NaN)
        and ``date_s`` (the index as timezone-naive datetimes).
    """
    # The date window is already fixed by start/end, so ask for daily bars via
    # `interval` instead of additionally passing a `period`.
    # NOTE(review): assumes '1d' was meant as the bar size — confirm against the yfinance docs.
    prices = yf.Ticker(ticker).history(interval='1d', start=start_date, end=end_date)

    # get daily returns
    prices["daily_returns"] = (prices["Close"] / prices["Close"].shift(1)) - 1

    # Expose the index as a date_s column. A timezone-aware index is converted
    # to naive wall-clock timestamps so the column can be merged with
    # timezone-naive date keys; pandas refuses to merge aware with naive datetimes.
    trading_days = pd.to_datetime(prices.index)
    if trading_days.tz is not None:
        trading_days = trading_days.tz_localize(None)
    prices['date_s'] = trading_days

    return prices

In [None]:
# *** Group Function ***

def group(data, stock_data):
    """Average the numeric tweet columns per calendar day and join stock data.

    Parameters
    ----------
    data : pandas.DataFrame
        Tweet-level frame with a ``date`` column parseable by ``pd.to_datetime``.
    stock_data : pandas.DataFrame
        Frame with a ``date_s`` datetime column to join on.

    Returns
    -------
    pandas.DataFrame
        One row per day present in both inputs (inner join on ``date_s``),
        holding the daily means of the numeric tweet columns followed by the
        columns of ``stock_data``.
    """
    tweet_day = pd.to_datetime(data['date']).dt.date

    # numeric_only=True: text/object columns (tweet text, score dicts, ...)
    # cannot be averaged, and recent pandas versions raise a TypeError for them
    # instead of silently dropping them.
    daily = data.groupby(tweet_day).mean(numeric_only=True)

    # convert the grouping key back to datetime so it matches stock_data['date_s']
    daily['date_s'] = pd.to_datetime(daily.index)

    # merge the two datasets, keeping only days present in both
    return pd.merge(daily, stock_data, on='date_s', how='inner')
    

In [None]:
# *** Correlation Analysis Function ***

def correlation(data,column_1,column_2):
    """Print the Pearson correlation between two columns of ``data``.

    The first row of both columns is skipped before the test is run. The
    coefficient and the p-value are printed rounded to three decimals;
    nothing is returned.
    """
    first_values = data[column_1][1:]
    second_values = data[column_2][1:]

    coefficient, significance = scipy.stats.pearsonr(first_values, second_values)

    report = (
        ('Correlation Coefficient:', coefficient),
        ('P-Value:', significance),
    )
    for label, value in report:
        print(label, value.round(3))

In [None]:
# *** Lag Function *** 

def lag(df):
    """Print the Pearson correlation between ``compound`` and a shifted ``Close``.

    ``Close`` is shifted down by two rows, so each row's ``compound`` is paired
    with the ``Close`` from two rows earlier. The first three rows are dropped
    from both series before the test. The coefficient and p-value are printed
    rounded to three decimals; nothing is returned.

    NOTE(review): the original comments spoke of a "delta return" that is
    "one day ahead", which does not match the code (raw Close, shift of 2) —
    confirm the intended lag and column.
    """
    shifted_close = df["Close"].shift(2)[3:]
    sentiment = df["compound"][3:]

    coefficient, significance = scipy.stats.pearsonr(sentiment, shifted_close)

    report = (
        ('Correlation Coefficient:', coefficient),
        ('P-Value:', significance),
    )
    for label, value in report:
        print(label, value.round(3))

In [None]:
# *** Granger Causality Function ***

def granger(data, lag):
    """Run Granger causality tests on the percentage changes of Close and compound.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Close`` and ``compound`` columns.
    lag :
        Forwarded unchanged as the second argument of ``grangercausalitytests``.

    Returns
    -------
    The object returned by ``grangercausalitytests``, so the statistics can be
    inspected programmatically (previously it was computed and discarded).
    """
    changes = data[["Close", "compound"]].pct_change()

    # Percentage changes from a zero base are +/-inf; treat them like missing
    # values and drop every incomplete row before testing.
    changes = changes.replace([np.inf, -np.inf], np.nan).dropna()

    # Column order matters: presumably this tests whether the second column
    # (compound) Granger-causes the first (Close) — confirm against the
    # statsmodels documentation.
    return grangercausalitytests(changes, lag)

# Microsoft - Example

### Data Import 

In [None]:
# Import Dataset

# file name
# NOTE(review): the file name refers to Tesla while the variables and the section
# heading refer to Microsoft — confirm which dataset is intended.
file_name = "twint_tesla_verified.json"
string = "/Users/pietj.ginski/Desktop/BWL-Studium/BWL 6 Semester/Bachelor Thesis/Raw Data BT/archive-2/rubbish/{}".format(file_name)
# import the json file (lines=True: one JSON object per line)
raw_msft = pd.read_json(string, lines = True)

# Non-null count of the second column. .iloc makes the positional lookup
# explicit; a bare integer key on the column-labelled Series returned by
# count() relies on deprecated positional fallback.
raw_msft.count().iloc[1]

### Data Manipulation

In [None]:
# Sentiment: score the raw tweets (retweets are removed inside nlp()).
nlp_msft = nlp(data=raw_msft)

# Prices: daily MSFT history for the study window.
price_msft = yahoo(
    ticker='MSFT',
    start_date='2020-06-01',
    end_date='2021-05-31',
    df_name='msft',
)

# Daily aggregation of the tweet data, joined with the price data.
grouped_msft = group(data=nlp_msft, stock_data=price_msft)

# Engagement: append the engagement score columns to the grouped frame.
grouped_msft = engagement_score(df=grouped_msft)