In [5]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing

import matplotlib.pyplot as plt
plt.rc("font", size=14)
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
pd.set_option('display.max_rows', 20)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 50)
%matplotlib inline
plt.style.use('ggplot')

In [6]:
data = pd.read_csv('EPSFINAL.csv')
data = data.dropna()
print(data.shape)
print(list(data.columns))

(491, 41)
['Symbol', 'Name', 'Date Public', 'Founded', 'MarketValue', 'PQ1EarningsDate', 'PQ1Variance', '-10 Before Earnings', '+10 Since Earnings', '+20 Since Earnings', '+30 Since Earnings', 'P/Q-2 Earnings Date', 'P/Q-2Variance', '-10 Before Earnings.1', '+10 Since Earnings.1', '+20 Since Earnings.1', '+30 Since Earnings.1', 'P/Q-3 Earnings Date', 'P/Q-3Variance', '-10 Before Earnings.2', '+10 Since Earnings.2', '+20 Since Earnings.2', '+30 Since Earnings.2', 'P/Q-4 Earnings Date', 'P/Q-4 % Variance', '-10 Before Earnings.3', '+10 Since Earnings.3', '+20 Since Earnings.3', '+30 Since Earnings.3', 'P/Q-5 Earnings Date', 'P/Q-5 % Variance', '-10 Before Earnings.4', '+10 Since Earnings.4', '+20 Since Earnings.4', '+30 Since Earnings.4', 'P/Q-6 Earnings Date', 'P/Q-6 % Variance', '-10 Before Earnings.5', '+10 Since Earnings.5', '+20 Since Earnings.5', '+30 Since Earnings.5']


In [7]:
data.head(5)

Unnamed: 0,Symbol,Name,Date Public,Founded,MarketValue,PQ1EarningsDate,PQ1Variance,-10 Before Earnings,+10 Since Earnings,+20 Since Earnings,+30 Since Earnings,P/Q-2 Earnings Date,P/Q-2Variance,-10 Before Earnings.1,+10 Since Earnings.1,+20 Since Earnings.1,+30 Since Earnings.1,P/Q-3 Earnings Date,P/Q-3Variance,-10 Before Earnings.2,+10 Since Earnings.2,+20 Since Earnings.2,+30 Since Earnings.2,P/Q-4 Earnings Date,P/Q-4 % Variance,-10 Before Earnings.3,+10 Since Earnings.3,+20 Since Earnings.3,+30 Since Earnings.3,P/Q-5 Earnings Date,P/Q-5 % Variance,-10 Before Earnings.4,+10 Since Earnings.4,+20 Since Earnings.4,+30 Since Earnings.4,P/Q-6 Earnings Date,P/Q-6 % Variance,-10 Before Earnings.5,+10 Since Earnings.5,+20 Since Earnings.5,+30 Since Earnings.5
0,AAPL,Apple Inc.,11/05/1984,1976.0,1216252.0,1/29/2020,10%,0.55%,-2.42%,-4.58%,7.36%,10/31/2019,7%,1.87%,3.53%,0.22%,2.00%,7/31/2019,4%,1.78%,-0.28%,-0.39%,4.25%,5/1/2019,4%,-0.67%,-7.00%,-4.49%,5.11%,1/3/2019,-10%,-1.64%,1.94%,4.14%,-0.36%,11/2/2018,5%,3.92%,-7.40%,-6.54%,-1.47%
1,MSFT,Microsoft Corporation,3/13/1986,1975.0,1208069.0,1/30/2020,15%,3.90%,3.46%,0.29%,7.13%,10/24/2019,11%,-3.68%,0.49%,2.94%,0.04%,7/19/2019,13%,-0.77%,1.74%,0.73%,0.02%,4/25/2019,14%,3.09%,-0.98%,2.12%,2.18%,1/31/2019,1%,-1.47%,0.72%,3.07%,2.69%,10/25/2018,18%,1.05%,-0.67%,-1.62%,4.07%
2,AMZN,"Amazon.com, Inc.",5/15/1997,1994.0,888591.6,1/31/2020,60%,0.62%,1.35%,-0.03%,-8.65%,10/25/2019,-8%,1.09%,-1.04%,-2.90%,-0.72%,7/26/2019,-6%,-1.59%,-3.53%,-0.88%,-0.02%,4/26/2019,51%,1.67%,-1.30%,-1.68%,-0.14%,2/1/2019,7%,-1.35%,-3.83%,4.75%,1.66%,10/26/2018,86%,4.42%,-0.45%,-3.96%,5.17%
3,GOOGL,Alphabet Inc. Class A,8/19/2004,2015.0,836412.7,2/4/2020,23%,2.57%,2.68%,-1.86%,-12.11%,10/29/2019,-18%,3.36%,0.99%,-0.43%,2.27%,7/26/2019,28%,-0.94%,-1.08%,-0.65%,-0.21%,4/30/2019,-10%,4.73%,-2.57%,1.37%,-7.28%,2/5/2019,18%,1.02%,-4.66%,4.35%,3.76%,10/26/2018,25%,1.95%,-5.31%,1.74%,1.80%
4,FB,"Facebook, Inc. Class A",5/18/2012,2004.0,485374.8,1/30/2020,1%,2.16%,-1.14%,2.41%,-2.44%,10/31/2019,11%,-2.61%,-1.27%,2.86%,-4.69%,7/25/2019,-52%,0.00%,-3.13%,-3.84%,1.05%,4/25/2019,-47%,1.10%,-0.54%,-2.55%,-6.11%,1/31/2019,9%,-1.79%,-3.29%,-3.22%,1.96%,10/31/2018,20%,-3.41%,-4.83%,-5.24%,8.81%


In [8]:
data.columns.to_series().groupby(data.dtypes).groups

{dtype('float64'): Index(['Founded'], dtype='object'),
 dtype('O'): Index(['Symbol', 'Name', 'Date Public', 'MarketValue', 'PQ1EarningsDate',
        'PQ1Variance', '-10 Before Earnings', '+10 Since Earnings',
        '+20 Since Earnings', '+30 Since Earnings', 'P/Q-2 Earnings Date',
        'P/Q-2Variance', '-10 Before Earnings.1', '+10 Since Earnings.1',
        '+20 Since Earnings.1', '+30 Since Earnings.1', 'P/Q-3 Earnings Date',
        'P/Q-3Variance', '-10 Before Earnings.2', '+10 Since Earnings.2',
        '+20 Since Earnings.2', '+30 Since Earnings.2', 'P/Q-4 Earnings Date',
        'P/Q-4 % Variance', '-10 Before Earnings.3', '+10 Since Earnings.3',
        '+20 Since Earnings.3', '+30 Since Earnings.3', 'P/Q-5 Earnings Date',
        'P/Q-5 % Variance', '-10 Before Earnings.4', '+10 Since Earnings.4',
        '+20 Since Earnings.4', '+30 Since Earnings.4', 'P/Q-6 Earnings Date',
        'P/Q-6 % Variance', '-10 Before Earnings.5', '+10 Since Earnings.5',
        '+20 Since E

In [9]:
data.dtypes

Symbol                    object
Name                      object
Date Public               object
Founded                  float64
MarketValue               object
                          ...   
P/Q-6 % Variance          object
-10 Before Earnings.5     object
+10 Since Earnings.5      object
+20 Since Earnings.5      object
+30 Since Earnings.5      object
Length: 41, dtype: object

## Data Transformations: Date to Age

In [10]:
df = data[['Name','Date Public','MarketValue','PQ1Variance','+10 Since Earnings','+20 Since Earnings','+30 Since Earnings']]

In [11]:
import datetime

In [12]:
todaydate = datetime.datetime.today()

## Create New Column in DF 

In [13]:
df['Age'] = (todaydate - pd.to_datetime(df['Date Public'])).astype('<m8[Y]')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
df.head(5)

Unnamed: 0,Name,Date Public,MarketValue,PQ1Variance,+10 Since Earnings,+20 Since Earnings,+30 Since Earnings,Age
0,Apple Inc.,11/05/1984,1216252.0,10%,-2.42%,-4.58%,7.36%,35.0
1,Microsoft Corporation,3/13/1986,1208069.0,15%,3.46%,0.29%,7.13%,34.0
2,"Amazon.com, Inc.",5/15/1997,888591.6,60%,1.35%,-0.03%,-8.65%,22.0
3,Alphabet Inc. Class A,8/19/2004,836412.7,23%,2.68%,-1.86%,-12.11%,15.0
4,"Facebook, Inc. Class A",5/18/2012,485374.8,1%,-1.14%,2.41%,-2.44%,7.0


In [15]:
## CAN WE CREATE MULTIPLE CATEGORIES OFF OF VARIANCE? Like high and low? 

In [None]:
df.dtypes

## Convert Objects to Floats & Integers 

In [None]:
def convertMarketValue(valueInString):
    return float(valueInString.replace(",",""))

In [None]:
df['MarketValue'] = df['MarketValue'].apply(convertMarketValue)

In [None]:
df['PQ1Variance'] = df['PQ1Variance'].apply(lambda x: np.nan if x in ['-'] else x[:-1]).astype(float)/100

In [None]:
df['+10 Since Earnings'] = df['+10 Since Earnings'].apply(lambda x: np.nan if x in ['-'] else x[:-1]).astype(float)/100

In [None]:
df['+20 Since Earnings'] = df['+20 Since Earnings'].apply(lambda x: np.nan if x in ['-'] else x[:-1]).astype(float)/100

In [None]:
df['+30 Since Earnings'] = df['+30 Since Earnings'].apply(lambda x: np.nan if x in ['-'] else x[:-1]).astype(float)/100

In [None]:
criteria = [df['Age']<=10, df['Age'].between(10,20,inclusive=False), df['Age']>=20]
values = [1,2,3]
df['AGECAT'] = np.select(criteria,values)

In [None]:
criteria = [df['MarketValue']<=10222,df['MarketValue'].between(10222,43868,inclusive=False),df['MarketValue']>=43868.07]
values = [1,2,3]
df['MRKTVCAT'] = np.select(criteria,values)

In [None]:
criteria = [df['PQ1Variance']<=0, df['PQ1Variance']>=0]
values = [0,1]
df['ExceedForecast'] = np.select(criteria,values)

In [None]:
criteria = [df['+10 Since Earnings']<=0, df['+10 Since Earnings']>=0]
values = [0,1]
df['y'] = np.select(criteria,values)

In [None]:
criteria = [df['+20 Since Earnings']<=0, df['+20 Since Earnings']>=0]
values = [0,1]
df['+20Gain/Loss'] = np.select(criteria,values)

In [None]:
criteria = [df['+30 Since Earnings']<=0, df['+30 Since Earnings']>=0]
values = [0,1]
df['+30Gain/Loss'] = np.select(criteria,values)

In [None]:
df.head(5)

In [None]:
df.describe()

## Ensure Data Types APPROPRIATE

In [None]:
df.dtypes

In [None]:
df['y'].value_counts()

In [None]:
count_no_sub = len(df[df['y']==0])
count_sub = len(df[df['y']==1])
pct_of_no_sub = count_no_sub/(count_no_sub+count_sub)
print("percentage of Companies that Lost SV", pct_of_no_sub*100)
pct_of_sub = count_sub/(count_no_sub+count_sub)
print("percentage of companies that gained SV", pct_of_sub*100)


In [None]:
sns.countplot(x='y',data=df, palette='hls')
plt.show()

In [None]:
## I want Y to be +10 Earnings: "Yes, it increased in value based off of X"

In [None]:
## Train and test seperate 

In [None]:
%matplotlib inline
pd.crosstab(df.MRKTVCAT,df.y).plot(kind='bar')
plt.title('Drift by Mrkt Cap Size')
plt.xlabel('Mrkt Cap Size')
plt.ylabel('Frequency of Drift')

In [None]:
%matplotlib inline
pd.crosstab(df.AGECAT,df.y).plot(kind='bar')
plt.title('Drift by Mrkt Cap Size')
plt.xlabel('AGE CATEGORY')
plt.ylabel('Frequency of Drift')

In [None]:
%matplotlib inline
pd.crosstab(df.ExceedForecast,df.y).plot(kind='bar')
plt.title('Drift: ExceedForecast')
plt.xlabel('')
plt.ylabel('Frequency of Drift')

In [None]:
##SHOULD WE CONCATENATE TO CREATE A SUPER VARIABLE? 


In [None]:
X = df.loc[:, df.columns != 'y']
y = df.loc[:, df.columns == 'y']

In [None]:
from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of subscription",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

In [None]:
## Why Cant I create a new data frame with just the dummy variables? 

In [None]:
df = data[['Age','Name','MRKTVCAT','ExceedForecast','y']]