# Observational study 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from networkx import nx
from plotnine import ggplot, aes, geom_bar
from functions import *
import spacy
import time
import datetime
import statsmodels.formula.api as smf
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category= SettingWithCopyWarning)

In [2]:
from popularity import trending_history
from financials import *

In [3]:
# Elon Musk quote data (per the file name). create_data() filters this frame
# on its 'ORG' column and reads the first 10 characters of its 'date' column.
elon_org_df = pd.read_csv('Data/org-lg-Elon Musk.csv.bz2')

## Matching : 

In [None]:
def compareCompanies(companies, tickers, date, shares, hist):
    """Build the per-company feature table for a single date.

    Returns a DataFrame with one row per entry of ``companies`` and the
    columns ['Name', 'Date', 'Close', 'Volume', 'MarketCap', 'Popularity'].

    companies -- company names; also handed to trending_history
    tickers   -- ticker symbols, positionally aligned with shares and hist
    date      -- value stored in 'Date' and passed to obs_info / trending_history
    shares    -- one shares-outstanding figure per ticker
    hist      -- one price-history frame per ticker, as consumed by obs_info
    """
    table = pd.DataFrame(companies, columns=['Name'])
    table['Date'] = date

    # One list per ticker; positions 0, 1, 2 feed Close, Volume, MarketCap.
    market_rows = [
        obs_info(symbol, date, shares[pos], hist[pos]).values.tolist()
        for pos, symbol in enumerate(tickers)
    ]
    for col_pos, col_name in enumerate(('Close', 'Volume', 'MarketCap')):
        table[col_name] = [row[col_pos] for row in market_rows]

    # Only the first row of the frame returned by trending_history is used.
    table['Popularity'] = trending_history(companies, date).values.tolist()[0]

    return table

def obs_info(ticker, date, sharesOutstanding, comp_hist):
    """Return the first history row dated strictly after ``date``, plus a market cap.

    ticker            -- unused; kept so existing callers keep working
    date              -- ISO date string ('YYYY-MM-DD')
    sharesOutstanding -- number of shares; None (or any falsy value) counts as 0
    comp_hist         -- DataFrame indexed by timestamps with at least a 'Close' column

    Returns a Series holding the columns of the selected row and an extra
    'MarketCap' entry equal to int(sharesOutstanding or 0) * Close.

    Raises ValueError when ``comp_hist`` has no row later than ``date``
    (previously this surfaced as an UnboundLocalError).
    """
    # Parse once instead of on every iteration.
    cutoff = datetime.date.fromisoformat(date)

    for pos, stamp in enumerate(comp_hist.index):
        if stamp.date() > cutoff:
            # Copy so that adding 'MarketCap' never writes into comp_hist.
            row = comp_hist.iloc[pos].copy()
            break
    else:
        raise ValueError(
            "no price history after {} for ticker {!r}".format(date, ticker)
        )

    row['MarketCap'] = int(sharesOutstanding or 0) * row['Close']

    return row

In [10]:
# The returned data frame has one row per company: an 'Elon' column (1 if Elon Musk
# talks about the company, 0 otherwise), the name of the company and the other four covariates.
def compare(company_quote, date, companies, tickers, shares, hist): 
    """Build the feature table for one date and label treated vs. control rows.

    The table comes from compareCompanies. Every row starts with Elon = 0 and
    compare = ``company_quote``; the last row (the quoted company, which
    callers place last in ``companies``) is then set to Elon = 1 and
    compare = 'None'.

    The treated row used to be hard-coded as label 4, which is only right for
    exactly five companies. Using the last row gives the same result there
    and also handles other list lengths.
    """
    company = compareCompanies(companies, tickers, date, shares, hist)

    company['Elon'] = 0
    company['compare'] = company_quote

    if len(company):
        treated_label = company.index[-1]
        company.at[treated_label, 'Elon'] = 1
        company.at[treated_label, 'compare'] = 'None'
    return company
    
#Call this function for each quote date of the treated company, together with the set of companies that serve as its controls.

In [11]:
def create_data(companies_list):
    """Assemble the feature table for one treated company and its controls.

    companies_list -- company names; the LAST entry is the company whose
                      quotes are looked up in ``elon_org_df``, the others
                      are its controls.

    For every distinct quote day (first 10 characters of the 'date' column)
    one block of rows is produced by ``compare``. The blocks are stacked and
    a 'Money Volume' column (Volume * Close) is added.

    Raises ValueError when ``elon_org_df`` holds no quote for the company
    (previously an IndexError on the empty date array).
    """
    company_quotes = companies_list[-1]

    # Distinct quote days in chronological order.
    quote_dates = (
        elon_org_df.loc[elon_org_df['ORG'] == company_quotes, 'date']
        .sort_values()
        .str[:10]
        .unique()
    )
    if len(quote_dates) == 0:
        raise ValueError("no quotes found for company {!r}".format(company_quotes))
    first_date = quote_dates[0]

    # One entry per company, whatever the list length (was hard-coded to 5).
    tickers = [ticker_of_company(name) for name in companies_list]
    securities = [yf.Ticker(symbol) for symbol in tickers]
    shares = [security.info['sharesOutstanding'] for security in securities]
    hist = [
        security.history(start=first_date)[['Close', 'Volume']]
        for security in securities
    ]

    # Collect the per-date frames and concatenate once: repeated concat in a
    # loop is quadratic, and the former empty seed frame could turn the
    # numeric columns into object dtype.
    frames = [
        compare(company_quotes, date, companies_list, tickers, shares, hist)
        for date in quote_dates
    ]
    data = pd.concat(frames)
    data['Money Volume'] = data['Volume'] * data['Close']
    return data

In [7]:
# Apple: the last list entry is the treated (quoted) company that create_data
# looks up in elon_org_df; the four names before it are its controls.
data_Apple = create_data(['Microsoft','IBM','Samsung','Dell','Apple'] )
print (data_Apple.shape)

(220, 9)


In [16]:
# PayPal: the last list entry is the treated (quoted) company that create_data
# looks up in elon_org_df; the four names before it are its controls.
data_PayPal = create_data(['Western Union','EuroNet','MoneyGram','Payoneer','PayPal'])

402007008
15.266165733337402
52856000
56.5
91688400
8.489999771118164
339008000
9.680000305175781
1174930048
36.709999084472656
402007008
15.05325984954834
52856000
64.30999755859375
91688400
8.890000343322754
339008000
9.680000305175781
1174930048
37.5099983215332
402007008
15.325908660888672
52856000
77.86000061035156
91688400
9.359999656677246
339008000
9.680000305175781
1174930048
32.630001068115234
402007008
14.601903915405273
52856000
73.36000061035156
91688400
7.860000133514404
339008000
9.680000305175781
1174930048
36.31999969482422
402007008
16.640453338623047
52856000
72.5199966430664
91688400
7.539999961853027
339008000
9.680000305175781
1174930048
37.41999816894531
402007008
17.248245239257812
52856000
77.25
91688400
6.849999904632568
339008000
9.680000305175781
1174930048
38.099998474121094
402007008
17.584997177124023
52856000
77.8499984741211
91688400
7.329999923706055
339008000
9.680000305175781
1174930048
38.11000061035156
402007008
16.137189865112305
52856000
90.73000

In [4]:
# Load the saved feature table. Its columns match the output of create_data
# plus an 'Unnamed: 0' column (see the display below).
# NOTE(review): presumably the stacked create_data() results written out
# earlier; the cell that saves this file is not shown here, please confirm.
data = pd.read_csv('Data/FinalFeatures.csv.bz2')
data

Unnamed: 0,Name,Date,Close,Volume,MarketCap,Popularity,Elon,compare,Money Volume
0,Microsoft,2015-02-05,37.501652,34616600.0,2.824463e+11,31,0,Apple,1.298180e+09
1,IBM,2015-02-05,108.493607,3406299.0,9.694153e+10,4,0,Apple,3.695617e+08
2,Samsung,2015-02-05,266.104340,8005050.0,1.588584e+12,100,0,Apple,2.130179e+09
3,Dell,2015-02-05,12.067676,271519.0,3.349842e+09,15,0,Apple,3.276603e+06
4,Apple,2015-02-05,27.105112,174826400.0,4.523193e+11,40,1,,4.738689e+09
...,...,...,...,...,...,...,...,...,...
4665,Rivian,2020-04-16,100.730003,103679500.0,8.898488e+10,1,0,Tesla,1.044364e+10
4666,Lucid,2020-04-16,9.890000,564300.0,1.628260e+10,9,0,Tesla,5.580927e+06
4667,VolksWagen,2020-04-16,117.395164,2322531.0,2.420747e+10,61,0,Tesla,2.726539e+08
4668,General Motors,2020-04-16,22.480000,17166300.0,3.263781e+10,2,0,Tesla,3.858984e+08


In [5]:
#Propensity score : 
def add_propensityscore(data):
    """Standardise the covariates of ``data`` in place and add a propensity score.

    The columns 'Close', 'Volume', 'Money Volume', 'MarketCap' and
    'Popularity' are each replaced by (value - mean) / std. A logit model of
    'Elon' on Close, Volume, their interaction, MarketCap and Popularity is
    then fitted, and its predictions are stored in a new 'Propensity_score'
    column.

    ``data`` is modified in place and nothing is returned.
    """
    # Normalise the features
    for feature in ('Close', 'Volume', 'Money Volume', 'MarketCap', 'Popularity'):
        column = data[feature]
        data[feature] = (column - column.mean()) / column.std()

    # Create the model
    model = smf.logit(formula='Elon ~ Close* Volume + MarketCap + Popularity', data=data)
    fitted = model.fit()
    data['Propensity_score'] = fitted.predict()

In [6]:
# Standardises the covariate columns of `data` in place and adds the
# 'Propensity_score' column; the function itself returns nothing.
add_propensityscore(data)

Optimization terminated successfully.
         Current function value: 0.113723
         Iterations 14




In [7]:
# Inspect the Microsoft rows after standardisation and propensity scoring.
data[data['Name']== 'Microsoft']

Unnamed: 0,Name,Date,Close,Volume,MarketCap,Popularity,Elon,compare,Money Volume,Propensity_score
0,Microsoft,2015-02-05,-0.072314,0.083692,-0.034855,-0.147734,0,Apple,-0.059075,0.508522
5,Microsoft,2015-02-06,-0.072336,0.008014,-0.034883,-0.218893,0,Apple,-0.061663,0.346191
10,Microsoft,2015-02-09,-0.07223,-0.031992,-0.03475,-0.218893,0,Apple,-0.062881,0.314703
15,Microsoft,2015-02-10,-0.072327,0.168968,-0.034872,-0.195174,0,Apple,-0.056213,0.500415
20,Microsoft,2015-02-12,-0.071672,0.215804,-0.034045,-0.195174,0,Apple,-0.053512,0.506547
25,Microsoft,2015-02-21,-0.07141,0.034625,-0.033715,-0.384933,0,Apple,-0.059488,0.146423
30,Microsoft,2015-02-24,-0.071481,-0.029908,-0.033804,-0.218893,0,Apple,-0.061865,0.285881
35,Microsoft,2015-03-12,-0.072638,0.630803,-0.035263,-0.195174,0,Apple,-0.041371,0.811892
40,Microsoft,2015-05-06,-0.07028,0.045218,-0.032289,-0.124014,0,Apple,-0.057534,0.420971
45,Microsoft,2015-05-07,-0.069815,0.101194,-0.031702,-0.147734,0,Apple,-0.054737,0.40688


In [8]:
def get_similarity(propensity_score1, propensity_score2):
    '''Similarity of two propensity scores: one minus their absolute difference.'''
    gap = np.abs(propensity_score1 - propensity_score2)
    return 1 - gap

In [9]:
def matchingfunction(companies):
    """Match each treated row to one control row by propensity score.

    companies -- DataFrame with the columns 'Elon', 'Name', 'Date', 'compare'
                 and 'Propensity_score'. The function now works on this
                 argument; it used to ignore it and read the global ``data``.

    A control row (Elon == 0) may be paired with a treated row (Elon == 1)
    only when the control's 'compare' equals the treated 'Name' and both
    share the same 'Date'. Each candidate pair becomes a graph edge weighted
    by get_similarity of the two propensity scores.

    Returns the result of nx.max_weight_matching on that graph: a set of
    pairs of index labels.
    """
    treatment_df = companies[companies['Elon'] == 1]
    control_df = companies[companies['Elon'] == 0]

    # Index the treated rows by (Name, Date) so each control row finds its
    # candidates directly instead of scanning every treated row.
    treated_lookup = {}
    for treatment_id, name, date, score in zip(treatment_df.index,
                                               treatment_df['Name'],
                                               treatment_df['Date'],
                                               treatment_df['Propensity_score']):
        # Missing keys never compared equal in the pairwise version; keep that.
        if pd.isna(name) or pd.isna(date):
            continue
        treated_lookup.setdefault((name, date), []).append((treatment_id, score))

    # Create an empty undirected graph
    G = nx.Graph()
    for control_id, target, date, score in zip(control_df.index,
                                               control_df['compare'],
                                               control_df['Date'],
                                               control_df['Propensity_score']):
        for treatment_id, treatment_score in treated_lookup.get((target, date), ()):
            # Calculate the similarity
            similarity = get_similarity(score, treatment_score)
            # Add an edge between the two instances weighted by the similarity between them
            G.add_weighted_edges_from([(control_id, treatment_id, similarity)])

    # Generate and return the maximum weight matching on the generated graph
    return nx.max_weight_matching(G)

In [10]:
# Pair every treated row with a control row, then keep only the matched rows.
matching = matchingfunction(data)
# First member of every pair, followed by the second member of every pair.
matched = [pair[0] for pair in matching] + [pair[1] for pair in matching]
# The matching holds index labels (taken from the frame's index), so select
# by label. On the default RangeIndex this equals the former .iloc lookup.
balanced = data.loc[matched]

## After matching : 


In [11]:
# Display the matched pairs of row labels returned by matchingfunction.
matching 

{(0, 4),
 (9, 5),
 (10, 14),
 (19, 15),
 (24, 20),
 (25, 29),
 (34, 30),
 (35, 39),
 (40, 44),
 (49, 45),
 (50, 54),
 (59, 55),
 (64, 60),
 (65, 69),
 (74, 70),
 (75, 79),
 (83, 84),
 (89, 85),
 (90, 94),
 (99, 95),
 (104, 100),
 (108, 109),
 (114, 110),
 (115, 119),
 (120, 124),
 (129, 125),
 (133, 134),
 (138, 139),
 (144, 143),
 (148, 149),
 (153, 154),
 (158, 159),
 (163, 164),
 (168, 169),
 (173, 174),
 (178, 179),
 (184, 183),
 (188, 189),
 (193, 194),
 (198, 199),
 (203, 204),
 (208, 209),
 (213, 214),
 (219, 215),
 (224, 221),
 (226, 229),
 (232, 234),
 (237, 239),
 (243, 244),
 (249, 246),
 (251, 254),
 (256, 259),
 (264, 261),
 (266, 269),
 (274, 271),
 (276, 279),
 (280, 284),
 (288, 289),
 (291, 294),
 (296, 299),
 (304, 301),
 (306, 309),
 (314, 311),
 (317, 319),
 (323, 324),
 (329, 325),
 (331, 334),
 (336, 339),
 (344, 341),
 (345, 349),
 (354, 350),
 (356, 359),
 (362, 364),
 (369, 366),
 (371, 374),
 (378, 379),
 (384, 383),
 (388, 389),
 (393, 394),
 (398, 399),
 (40