In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt
import pickle
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

import altair as alt
alt.renderers.enable('notebook')

import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from pandas.io.json import json_normalize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import MultinomialNB


from imblearn.under_sampling import CondensedNearestNeighbour, RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN



In [2]:
df = pd.read_csv('../data/processed/final_book_df.csv', index_col=0)

In [3]:
df2 = pd.read_csv('../data/processed/reddit_df_with_dates.csv', index_col=0)

In [4]:
df2 = df2.drop(columns=['duck_dates', 'ask_dates'])
df = df.append(df2)

In [8]:
df = df.reset_index()
df = df.drop(columns=['index'])

In [6]:
df = df.append(df2)

In [9]:
df.describe()

Unnamed: 0,date,target
count,1294.0,1022.0
mean,1865.688563,2.529354
std,124.642119,1.738716
min,1511.0,0.0
25%,1819.0,1.0
50%,1907.0,2.0
75%,1959.0,4.0
max,2016.0,5.0


In [10]:
df = df.drop_duplicates(subset=['text'], keep='first', inplace=False)

In [11]:
df['date'] = df['date'].astype(int)

In [12]:
df.shape

(1097, 4)

In [13]:
df.isna().sum()

date        0
info        2
target    136
text        0
dtype: int64

In [14]:
df.head()

Unnamed: 0,date,info,target,text
0,1528,The book of the Courtier,0.0,then the soul freed from vice purged by studie...
1,1569,Hamlet,0.0,his goodly frame the earth seems to me a steri...
2,1592,the spanish tragedy,0.0,"O eyes, no eyes, but fountains fraught with te..."
3,1569,Hamlet,0.0,firmament this majestical roof fretted with go...
4,1623,macbeth,0.0,mine eyes are made the fools o the other sense...


# Clean Text

In [None]:
from sklearn.base import TransformerMixin

class LanguageTransformer(TransformerMixin):

    def fit(self, x_train):
        return self

    def transform(self, x_train):
        new_list = []
        new_line = []
        final_line = []
        final_entry = []
        for item in x_train:
            new_list.append(item + ', ')
            for list_item in new_list:
                new_line.append(list_item.split())
                for line in new_line:
                    final_line = []
                    for word in line:
                        lemmatizer = WordNetLemmatizer()
                        raw_text = str(word)
                        string_lower_case = raw_text.lower()
                        # new_text = string_lower_case.astype('U')
                        retokenizer = RegexpTokenizer(r'[a-z]+')
                        words = retokenizer.tokenize(string_lower_case)
                        lemm_words = lemmatizer.lemmatize(" ".join(words))
                        final_line.append(lemm_words)

                final_entry.append(final_line)


        return final_entry

In [None]:
ct = LanguageTransformer()
ct.fit_transform(df['text'])

In [15]:

def clean_text(raw_text):
    lemmatizer = WordNetLemmatizer()
    raw_text = str(raw_text)
    lower_case = raw_text.lower()
    retokenizer = RegexpTokenizer(r'[a-z]+')
    words = retokenizer.tokenize(lower_case)

    return(lemmatizer.lemmatize(" ".join(words)))

df['text'] = df['text'].apply(clean_text)

# Function to Grid Search Target Dates

In [16]:
## Two preliminary instantiated vectorizers to be used in GridSearch function

cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(1, 2),
                        strip_accents='unicode')

tvec = TfidfVectorizer(stop_words='english',
                        ngram_range=(1, 3),
                        encoding='utf-8')

In [17]:
bin_list = [
    [0, 1670, 1800, 1870, 1910, 1945, np.inf],
    [0, 1670, 1830, 1870, 1910, 1945, np.inf],
    [0, 1670, 1830, 1870, 1920, 1945, np.inf],
    [0, 1670, 1800, 1870, 1920, 1945, np.inf],
    [0, 1670, 1800, 1870, 1920, 1960, np.inf],
    [0, 1670, 1830, 1890, 1920, 1945, np.inf],
    [0, 1670, 1830, 1890, 1920, 1950, np.inf],
    [0, 1670, 1830, 1890, 1910, 1945, np.inf],
    [0, 1670, 1830, 1890, 1930, 1975, np.inf],
    [0, 1700, 1800, 1870, 1910, 1945, np.inf],
    [0, 1700, 1830, 1890, 1910, 1945, np.inf],
    [0, 1700, 1830, 1870, 1920, 1945, np.inf],
    [0, 1670, 1830, 1870, 1920, 1975, np.inf],
    [0, 1670, 1830, 1890, 1920, 1975, np.inf],
    [0, 1600, 1700, 1800, 1900, 1950, np.inf],
    [0, 1670, 1830, 1920, 1950, 1990, np.inf],
    [0, 1700, 1830, 1890, 1910, 1945, np.inf],
    [0, 1670, 1830, 1910, 1950, 1990, np.inf],
    [0, 1670, 1870, 1910, 1950, 1990, np.inf],
    [0, 1670, 1830, 1890, 1920, 1990, np.inf],
    [0, 1670, 1830, 1890, 1930, 1990, np.inf],
    [0, 1670, 1830, 1890, 1920, 1960, np.inf]
]

In [18]:
def make_targets(bin_list, model, vectorizer, df=df):
    
    '''
    Function to grid search and find the optimal target for time periods
    bin_list: several ways to classify the targets
    model: model to instantiate
    vectorizer: either cvec or tvec
    '''
    
    for b in bin_list:
        bins = b
        bin_names = range(0, 6)
        df['target'] = pd.cut(df['date'], bins, labels=bin_names)
        df.groupby('target').count()

        #train test split
        x = df['text']
        y = df['target']
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42, shuffle=True, stratify=y)
        
        
        
        #vectorizing
        train_data = vectorizer.fit_transform(x_train.apply(lambda x: np.str_(x)))
        test_data = vectorizer.transform(x_test.apply(lambda x: np.str_(x)))
        
       
        
        #instantiating, fitting, and scoring the model
        model = model
        model.fit(train_data, y_train)
        score = model.score(test_data, y_test)
       
        print(f' Test Accuracy of Bin {bins}: {score}')

In [19]:
make_targets(bin_list, model=LogisticRegression(class_weight='balanced'), vectorizer=cvec)

 Test Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf]: 0.6909090909090909
 Test Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf]: 0.6727272727272727
 Test Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1945, inf]: 0.6681818181818182
 Test Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1945, inf]: 0.7045454545454546
 Test Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1960, inf]: 0.6818181818181818
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1945, inf]: 0.7090909090909091
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1950, inf]: 0.7045454545454546
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1910, 1945, inf]: 0.6818181818181818
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1930, 1975, inf]: 0.7363636363636363
 Test Accuracy of Bin [0, 1700, 1800, 1870, 1910, 1945, inf]: 0.6818181818181818
 Test Accuracy of Bin [0, 1700, 1830, 1890, 1910, 1945, inf]: 0.6727272727272727
 Test Accuracy of Bin [0, 1700, 1830, 1870, 1920, 1945, inf]: 0.6363636363636364
 Test Accuracy of Bin [0, 16

In [20]:
make_targets(bin_list, model=RandomForestClassifier(), vectorizer=cvec) 

 Test Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf]: 0.5454545454545454
 Test Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf]: 0.5454545454545454
 Test Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1945, inf]: 0.5181818181818182
 Test Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1945, inf]: 0.5272727272727272
 Test Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1960, inf]: 0.55
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1945, inf]: 0.5454545454545454
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1950, inf]: 0.5272727272727272
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1910, 1945, inf]: 0.55
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1930, 1975, inf]: 0.55
 Test Accuracy of Bin [0, 1700, 1800, 1870, 1910, 1945, inf]: 0.5545454545454546
 Test Accuracy of Bin [0, 1700, 1830, 1890, 1910, 1945, inf]: 0.5318181818181819
 Test Accuracy of Bin [0, 1700, 1830, 1870, 1920, 1945, inf]: 0.5409090909090909
 Test Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1975, inf]: 0.509090

In [None]:
make_targets(bin_list, model=RandomForestClassifier(), vectorizer=tvec)

In [21]:
make_targets(bin_list, model=LogisticRegression(class_weight='balanced'), vectorizer=tvec)

 Test Accuracy of Bin [0, 1670, 1800, 1870, 1910, 1945, inf]: 0.6909090909090909
 Test Accuracy of Bin [0, 1670, 1830, 1870, 1910, 1945, inf]: 0.6909090909090909
 Test Accuracy of Bin [0, 1670, 1830, 1870, 1920, 1945, inf]: 0.6636363636363637
 Test Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1945, inf]: 0.7090909090909091
 Test Accuracy of Bin [0, 1670, 1800, 1870, 1920, 1960, inf]: 0.6772727272727272
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1945, inf]: 0.7
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1920, 1950, inf]: 0.7272727272727273
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1910, 1945, inf]: 0.6863636363636364
 Test Accuracy of Bin [0, 1670, 1830, 1890, 1930, 1975, inf]: 0.6727272727272727
 Test Accuracy of Bin [0, 1700, 1800, 1870, 1910, 1945, inf]: 0.6772727272727272
 Test Accuracy of Bin [0, 1700, 1830, 1890, 1910, 1945, inf]: 0.6681818181818182
 Test Accuracy of Bin [0, 1700, 1830, 1870, 1920, 1945, inf]: 0.6318181818181818
 Test Accuracy of Bin [0, 1670, 1830, 1870,

# Binning & Cleaning

In [22]:
bins = [0, 1670, 1830, 1890, 1920, 1950, np.inf] #Performed well on all
names = [0, 1, 2, 3, 4, 5]

df['target'] = pd.cut(df['date'], bins, labels=names)

df.groupby('target').count()

Unnamed: 0_level_0,date,info,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,149,148,149
1,163,163,163
2,196,196,196
3,148,148,148
4,172,172,172
5,269,268,269


# EDA

In [23]:
df.shape

(1097, 4)

In [24]:
df.head(2)

Unnamed: 0,date,info,target,text
0,1528,The book of the Courtier,0,then the soul freed from vice purged by studie...
1,1569,Hamlet,0,his goodly frame the earth seems to me a steri...


In [25]:
df.isna().sum()

date      0
info      2
target    0
text      0
dtype: int64

In [26]:
df.dtypes

date         int64
info        object
target    category
text        object
dtype: object

In [27]:
df.target = df.target.astype(int)

In [None]:
# df.to_csv('../data/cleaned/book_df.csv')

# Countvectorizer Unigrams

In [100]:
stop = stopwords.words('english')
stop.extend(['one', 'word', 'us', 'could', 'go', 'let', 'see', 'would', 'two', 'said', 'made', 'brutus', 'dryden']) 

In [101]:
cvec = CountVectorizer(stop_words=stop)
unigrams = pd.DataFrame(cvec.fit_transform(df['text']).todense(), columns=cvec.get_feature_names())
unigrams['target'] = df['target']

In [None]:
target0_top_uni = unigrams.groupby('target').sum().T.sort_values(by=0, ascending=False)[[0]].head(20)
target1_top_uni = unigrams.groupby('target').sum().T.sort_values(by=1, ascending=False)[[0]].head(20)
target2_top_uni = unigrams.groupby('target').sum().T.sort_values(by=2, ascending=False)[[0]].head(20)
target3_top_uni = unigrams.groupby('target').sum().T.sort_values(by=3, ascending=False)[[0]].head(20)
target4_top_uni = unigrams.groupby('target').sum().T.sort_values(by=4, ascending=False)[[0]].head(20)
target5_top_uni = unigrams.groupby('target').sum().T.sort_values(by=5, ascending=False)[[0]].head(20)

In [None]:
def clean_gram_df(uni_df):
    unigram_df = uni_df.rename(columns={0: 'frequency'})
    unigram_df = unigram_df.reset_index()
    unigram_df = unigram_df.rename(columns={'index': 'word'})
    return unigram_df

In [None]:
# using the clean_uni_df function to create individual dataframes by target

target0 = clean_gram_df(target0_top_uni)
target1 = clean_gram_df(target1_top_uni)
target2 = clean_gram_df(target2_top_uni)
target3 = clean_gram_df(target3_top_uni)
target4 = clean_gram_df(target4_top_uni)
target5 = clean_gram_df(target5_top_uni)

In [None]:
def altair_chart(target, color='darkred'):
    '''
    Input: target dataframe and color
    Output: altair chart
    '''
    
    Chart = alt.Chart(target).mark_bar(color=color).encode(
        x='word',
        y='frequency')
    
    return Chart

In [None]:
altair_chart(target0)

In [None]:
altair_chart(target1, color='blue')

In [None]:
altair_chart(target2, color='orange')

In [None]:
altair_chart(target3, color='black')

In [None]:
altair_chart(target4, color='yellow')

In [None]:
altair_chart(target5, color='purple')

# CountVectorizer for Bigrams

In [None]:
bivec = CountVectorizer(stop_words=stop, ngram_range=(2, 2))
bigrams = pd.DataFrame(bivec.fit_transform(df['text']).todense(), columns=bivec.get_feature_names())
bigrams['target'] = df['target']

In [None]:
target0_bi = bigrams.groupby('target').sum().T.sort_values(by=0, ascending=False)[[0]].head(20)
target1_bi = bigrams.groupby('target').sum().T.sort_values(by=1, ascending=False)[[0]].head(20)
target2_bi = bigrams.groupby('target').sum().T.sort_values(by=2, ascending=False)[[0]].head(20)
target3_bi = bigrams.groupby('target').sum().T.sort_values(by=3, ascending=False)[[0]].head(20)
target4_bi = bigrams.groupby('target').sum().T.sort_values(by=4, ascending=False)[[0]].head(20)
target5_bi = bigrams.groupby('target').sum().T.sort_values(by=5, ascending=False)[[0]].head(20)

In [None]:
target0 = clean_gram_df(target0_bi)
target1 = clean_gram_df(target1_bi)
target2 = clean_gram_df(target2_bi)
target3 = clean_gram_df(target3_bi)
target4 = clean_gram_df(target4_bi)
target5 = clean_gram_df(target5_bi)


In [None]:
altair_chart(target0)

In [None]:
altair_chart(target1)

In [None]:
altair_chart(target2)

In [None]:
# Because the dataset is so small, there is not enough data to find bivecs.

# LDA

In [None]:
target0 = df[df.target == 0]['text']
target1 = df[df.target == 1]['text']
target2 = df[df.target == 2]['text']
target3 = df[df.target == 3]['text']
target4 = df[df.target == 4]['text']
target5 = df[df.target == 5]['text']
target6 = df[df.target == 6]['text']

In [None]:
def LDA_graph(target_group):
    
    lda = LDA(n_components=3, random_state=42)
    t = cvec.fit_transform(target_group)
    lda_t = lda.fit_transform(t)
    
    return pyLDAvis.sklearn.prepare(lda, t, cvec)

In [None]:
LDA_graph(target0)

In [None]:
LDA_graph(target1)

In [None]:
LDA_graph(target2)

In [None]:
LDA_graph(target3)

In [None]:
LDA_graph(target4)

In [None]:
LDA_graph(target5)

# Sentiment Analysis with Vader

In [None]:
analyzer = SentimentIntensityAnalyzer()

In [None]:
def sentiment_analyzer_scores(sentence):
    score = analyzer.polarity_scores(sentence)
    return score 

In [None]:
df['sentiment'] = df.text.apply(sentiment_analyzer_scores)

In [None]:
df_sent = json_normalize(df.sentiment)

In [None]:
df_sent = df_sent.reset_index()

In [None]:
df_sent = df_sent.drop(columns=['index'])

In [None]:
df = df.join((df_sent), how='outer')

In [None]:
# df.to_csv('../data/cleaned/df_with_sentiment.csv')

In [None]:
df = df.drop(columns=['sentiment'])

In [None]:
compound = df.groupby('target', as_index=False)['compound'].mean()
neg = df.groupby('target', as_index=False)['neg'].mean()
neu = df.groupby('target', as_index=False)['neu'].mean()
pos = df.groupby('target', as_index=False)['pos'].mean()

In [None]:
sentiment_chart = pd.concat([compound, neg, neu, pos], axis=1)

In [None]:
sentiment_chart

In [None]:
sentiment_chart = sentiment_chart.loc[:, ~sentiment_chart.columns.duplicated()]

In [None]:
A = alt.Chart(sentiment_chart).mark_bar(color='black').encode(
    x='target:O',
    y='sum(compound):Q'
)

B = alt.Chart(sentiment_chart).mark_bar(color='red').encode(
    x='target:O',
    y='sum(neg):Q'
)

C = alt.Chart(sentiment_chart).mark_bar(color='purple').encode(
    x='target:O',
    y='sum(neu):Q'
)

D = alt.Chart(sentiment_chart).mark_bar().encode(
    x='target:O',
    y='sum(pos):Q'
)

A | B | C | D

In [None]:
alt.Chart(df).mark_point().encode(
    x='target',
    y='compound',
    color='target',
).facet(column='Origin:N')

In [None]:
df.head()

# Naive Model 

In [76]:
X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)



In [77]:
cvec = CountVectorizer(stop_words='english',
                        lowercase=True,
                        ngram_range=(2, 2),
                        strip_accents='unicode')

tvec = TfidfVectorizer(stop_words='english',
                        ngram_range=(1, 3),
                        encoding='utf-8')

In [78]:
X_train_counts = cvec.fit_transform(X_train)
X_test_counts = cvec.transform(X_test)

In [79]:
df_counts = pd.DataFrame(X_train_counts.todense(), columns=cvec.get_feature_names())
df_counts.head()

Unnamed: 0,aadam aziz,abalone shells,abandon benefits,abandon intended,abandon pair,abandoned beach,abandoned case,abandoned evidence,abandoned framework,abandoned heroine,...,zulu burmese,zulu evening,zulu impi,zulu manager,zulu people,zulu wish,zulu woman,zulu wot,zurich staring,zygote lost
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
model1 = LogisticRegression()
model1.fit(X_train_counts, y_train)
y_pred = model1.predict(X_test_counts)
print(model1.score(X_train_counts, y_train))
print(model1.score(X_test_counts, y_test))

1.0
0.4909090909090909


In [81]:
def classification_metrics(y_test, y_pred):
    print(f' Accuracy Score: {accuracy_score(y_test, y_pred)}')
    print(f' Precision Score: {precision_score(y_test, y_pred, average = None)}')
    print(f' Recall Score: {recall_score(y_test, y_pred, average = None)}')
    
classification_metrics(y_test, y_pred)

 Accuracy Score: 0.4909090909090909
 Precision Score: [1.         1.         1.         1.         0.9        0.29487179]
 Recall Score: [0.38888889 0.08333333 0.32       0.375      0.5625     1.        ]


# Imbalance Learn with SMOTE

In [82]:
sm = SMOTE()
X_reb, y_reb = sm.fit_sample(X_train_counts, y_train)

model1.fit(X_reb, y_reb)
print(model1.score(X_reb, y_reb))
print(model1.score(X_test_counts, y_test))

0.8780487804878049
0.5181818181818182


In [83]:
y_pred = model1.predict(X_test_counts)


In [84]:
classification_metrics(y_test, y_pred)

 Accuracy Score: 0.5181818181818182
 Precision Score: [0.38636364 0.42857143 0.88888889 0.69230769 0.38461538 0.90909091]
 Recall Score: [0.94444444 0.25       0.32       0.5625     0.625      0.43478261]


# Regularization

In [85]:
model2 = LogisticRegression(C = 0.001,
                         class_weight = 'balanced',
                         multi_class = 'multinomial',
                         penalty= 'l2',
                         solver= 'sag')
model2.fit(X_reb, y_reb)
print(f'Train score: {model2.score(X_reb, y_reb)}')
print(f'Test score: {model2.score(X_test_counts, y_test)}')

Train score: 0.8170731707317073
Test score: 0.6181818181818182


In [86]:
y_pred = model2.predict(X_test_counts)

In [87]:
classification_metrics(y_test, y_pred)

 Accuracy Score: 0.6181818181818182
 Precision Score: [1.         0.55555556 0.65384615 0.61111111 0.75       0.5       ]
 Recall Score: [0.38888889 0.41666667 0.68       0.6875     0.5625     0.82608696]


# Further Exploration of Preds

In [88]:
comparison = pd.DataFrame(y_pred, y_test)
comparison = comparison.reset_index()

In [89]:
comparison = comparison.rename(columns={0: 'prediction'})

In [90]:
comparison.head()

Unnamed: 0,target,prediction
0,0,5
1,3,2
2,0,0
3,4,4
4,3,2


In [91]:
comparison['prediction'] = comparison['prediction'].astype(int)

In [92]:
comparison['correct'] = np.where(comparison['target'] == comparison['prediction'], 1, 0)

In [93]:
comparison = comparison.rename(columns={0: 'prediction'})

In [94]:
comparison.target = comparison.target.astype(int)

In [95]:
comparison.tail()

Unnamed: 0,target,prediction,correct
105,5,5,1
106,2,2,1
107,3,3,1
108,3,2,0
109,5,5,1


In [96]:
comparison.head()

Unnamed: 0,target,prediction,correct
0,0,5,0
1,3,2,0
2,0,0,1
3,4,4,1
4,3,2,0


In [97]:
comparison['difference'] = abs(comparison.target - comparison.prediction)

In [98]:
comparison.head()

Unnamed: 0,target,prediction,correct,difference
0,0,5,0,5
1,3,2,0,1
2,0,0,1,0
3,4,4,1,0
4,3,2,0,1


In [99]:
borderline_dates = len(comparison[comparison['difference'] <= 1]) / len(comparison['difference'])
borderline_dates

0.7909090909090909