In [1]:
#import packages
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy import stats
import statsmodels.api as sm
from sklearn.metrics import accuracy_score

%matplotlib inline

In [2]:
filename = '~/galvanize/capstones/PredictingSearches-on-EDGAR/Logs/111001.csv'

In [3]:
#Cleaning the data
def file_to_df(filename):
    return pd.read_csv(filename, names=['IP','Date','Time','Accession'])

In [4]:
df = file_to_df(filename)

In [5]:
def find_unique_docs(df,number):
    Uniq = df.Accession.unique()
    u = np.random.choice(Uniq,number)
    return df[df['Accession'].isin(u)]

In [6]:
df_uniq = find_unique_docs(df,100000)

In [7]:
print(df.shape, df_uniq.shape)

(2113977, 4) (245489, 4)


In [8]:
# df is the whole thing df_uniq is the sample

In [9]:
def cleaning_df(df):
    df = df.reset_index()
    df['Datetime'] = df['Date'] + ' ' + df['Time']
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    df = df.drop(['index','IP','Date','Time'],axis=1)
    df.set_index(df.Datetime, inplace=True)
    df = df.drop('Datetime',axis=1)
    df['Counts'] = df.groupby(df.Accession).cumcount()+1
    return df

In [10]:
df_cleaned = cleaning_df(df_uniq)

In [11]:
df_cleaned.shape

(245489, 2)

In [12]:
def find_popular_docs(df,threshold=.001):
    df_sorted = df.sort_values('Counts').drop_duplicates('Accession',keep='last')
    df_sorted['Popular'] = np.where((df_sorted['Counts']/df_sorted['Counts'].cumsum().max()) < threshold,0,1)
    return df_sorted

In [13]:
pop_docs = find_popular_docs(df_cleaned)

In [14]:
pop_docs.tail(20)

Unnamed: 0_level_0,Accession,Counts,Popular
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-10-01 23:44:09,0000950123-11-025579,439,1
2011-10-01 23:23:55,0001193125-09-029055,442,1
2011-10-01 23:39:19,0000894189-06-000390,485,1
2011-10-01 23:56:30,0001193125-11-260021,553,1
2011-10-01 23:53:48,0001193125-11-032930,566,1
2011-10-01 23:56:22,0000950123-11-087983,609,1
2011-10-01 23:35:18,0001193125-11-046701,613,1
2011-10-01 23:05:27,0000950142-11-001673,641,1
2011-10-01 23:56:59,0000078239-11-000061,655,1
2011-10-01 23:59:32,0001145443-11-000999,883,1


In [15]:
## Apply Classification Column to Whole DF

In [16]:
def add_classifier_X(df,df_sorted):
    df = df.reset_index()
    df_sorted = df_sorted.reset_index()
    X = pd.merge(df, df_sorted[['Accession','Popular']], on='Accession',how='left')
    return X.set_index('Datetime')

In [17]:
df_pop = add_classifier_X(df_cleaned, pop_docs)

In [18]:
df_pop.shape

(245489, 3)

In [19]:
df_pop.head()

Unnamed: 0_level_0,Accession,Counts,Popular
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-10-01,0001494647-11-000029,1,0
2011-10-01,0001494647-11-000081,1,0
2011-10-01,0000905729-11-000288,1,0
2011-10-01,0001437749-11-007171,1,0
2011-10-01,0001145549-11-008655,1,0


In [20]:
## GETTING PRIOR TO 10 AM

In [21]:
def df_beforen(df,threshold=.001,stoptime='2011-10-01 10:00:00'):
    stop = pd.Timestamp(stoptime)
    df = df[:stop]
    df.reset_index()
    df['Popular_BeforeN'] = np.where((df['Counts']/df['Counts'].cumsum().max()) < threshold,0,1)
    return df

In [22]:
stoptime='2011-10-01 10:00:00'
stop = pd.Timestamp(stoptime)
df_popb = df_pop[:stop]

In [23]:
df_popb = df_popb.reset_index()

In [24]:
threshold = .001
df_popb['Counts_B'] = df_popb.groupby(df_popb.Accession).cumcount()+1
df_popb['Popular_BeforeN'] = np.where((df_popb['Counts_B']/df_popb['Counts_B'].cumsum().max()) < threshold,0,1)

In [25]:
df_popb.shape

(111045, 6)

In [26]:
df_popb.head()

Unnamed: 0,Datetime,Accession,Counts,Popular,Counts_B,Popular_BeforeN
0,2011-10-01,0001494647-11-000029,1,0,1,0
1,2011-10-01,0001494647-11-000081,1,0,1,0
2,2011-10-01,0000905729-11-000288,1,0,1,0
3,2011-10-01,0001437749-11-007171,1,0,1,0
4,2011-10-01,0001145549-11-008655,1,0,1,0


In [27]:
 #Applying Classifier before 10 classifier
def add_classifier_B(df,df_B):
    df = df.reset_index()
    df_B = df_B.reset_index()
    X = pd.merge(df, df_B[['Accession','Popular_BeforeN']], on='Accession',how='left')
    return X.set_index('Datetime')

In [28]:
df_full = add_classifier_B(df_pop,df_popb)

In [29]:
df_full.shape

(133902933, 4)

In [30]:
df5 = df_full.drop_duplicates()

In [32]:
df5.shape

(245489, 4)

In [33]:
df5 = df5.reset_index()

In [34]:
df5.head()

Unnamed: 0,Datetime,Accession,Counts,Popular,Popular_BeforeN
0,2011-10-01,0001494647-11-000029,1,0,0.0
1,2011-10-01,0001494647-11-000081,1,0,0.0
2,2011-10-01,0000905729-11-000288,1,0,0.0
3,2011-10-01,0001437749-11-007171,1,0,0.0
4,2011-10-01,0001145549-11-008655,1,0,0.0


In [35]:
df5.Popular_BeforeN = df5.Popular_BeforeN.fillna(0)

In [36]:
df5.Popular_BeforeN = df5.Popular_BeforeN.astype('int')

In [37]:
df5.head()

Unnamed: 0,Datetime,Accession,Counts,Popular,Popular_BeforeN
0,2011-10-01,0001494647-11-000029,1,0,0
1,2011-10-01,0001494647-11-000081,1,0,0
2,2011-10-01,0000905729-11-000288,1,0,0
3,2011-10-01,0001437749-11-007171,1,0,0
4,2011-10-01,0001145549-11-008655,1,0,0


In [38]:
X_class = df5

In [39]:
docnames = X_class.pop('Accession').values
dates = X_class.pop('Datetime').values
counts = X_class.pop('Counts').values
y_class = X_class.pop('Popular').values

In [None]:
#Resampling for Undersample

from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(X_class, y_class)
print(sorted(Counter(y_resampled).items()))

In [None]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=0)

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

predictions = logmodel.predict(X_test)

ValueError: Expected 2D array, got 1D array instead:
array=[0. 0. 0. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [88]:
from sklearn.metrics import classification_report

print(classification_report(y_test,predictions))
print("Accuracy:", accuracy_score(y_test, predictions))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00     16197
           1       1.00      1.00      1.00   5526723

    accuracy                           1.00   5542920
   macro avg       0.50      0.50      0.50   5542920
weighted avg       0.99      1.00      1.00   5542920

Accuracy: 0.9970778939620273
