In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5), "This notebook requires Python 3.5 or newer"

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers. A plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" evaluates to True and an outdated
# release would slip through the check.
_sklearn_release = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_release is not None, (
    "Could not parse scikit-learn version: " + sklearn.__version__
)
assert tuple(int(part) for part in _sklearn_release.groups()) >= (0, 20), (
    "This notebook requires scikit-learn 0.20 or newer, found " + sklearn.__version__
)

# Common imports
import numpy as np
import os
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# Fix NumPy's global random seed so repeated runs give the same output
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [19]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the three CSV files (unigram frequencies, answers, other words)
# into DataFrames, in that order
words, answers, other_word = (
    pd.read_csv(csv_path)
    for csv_path in ('./unigram_freq.csv', './answers.csv', './other_words.csv')
)

In [20]:
# Sanity check: confirm what kind of object read_csv handed back
for loaded in (words, answers):
    print(type(loaded))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [21]:
# Preview each raw table: first five rows, last five rows, then its shape
for frame in (words, answers, other_word):
    display(frame.head(), frame.tail(), frame.shape)

Unnamed: 0,word,count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698


Unnamed: 0,word,count
333328,gooek,12711
333329,gooddg,12711
333330,gooblle,12711
333331,gollgo,12711
333332,golgw,12711


(333333, 2)

Unnamed: 0,word
0,cigar
1,rebut
2,sissy
3,humph
4,awake


Unnamed: 0,word
2310,judge
2311,rower
2312,artsy
2313,rural
2314,shave


(2315, 1)

Unnamed: 0,word
0,aahed
1,aalii
2,aargh
3,aarti
4,abaca


Unnamed: 0,word
10652,zuzim
10653,zygal
10654,zygon
10655,zymes
10656,zymic


(10657, 1)

In [31]:
# Attach the frequency 'count' to each word list with a left join on 'word';
# words that do not appear in the frequency table get NaN for 'count'
freq_by_word = words.set_index('word')
df_other = other_word.join(freq_by_word, on='word')
df_ans = answers.join(freq_by_word, on='word')

In [32]:
# Inspect the joined "other words" table: both ends, summary stats, dtypes
display(df_other.head(), df_other.tail(), df_other.describe())
df_other.info()

Unnamed: 0,word,count
0,aahed,
1,aalii,
2,aargh,71592.0
3,aarti,63273.0
4,abaca,79841.0


Unnamed: 0,word,count
10652,zuzim,
10653,zygal,
10654,zygon,15300.0
10655,zymes,24137.0
10656,zymic,30459.0


Unnamed: 0,count
count,5778.0
mean,2709493.0
std,16128170.0
min,12716.0
25%,38762.5
50%,109200.0
75%,483744.5
max,347710200.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10657 entries, 0 to 10656
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   word    10657 non-null  object 
 1   count   5778 non-null   float64
dtypes: float64(1), object(1)
memory usage: 166.6+ KB


In [33]:
# Inspect the joined answers table: both ends, summary stats, dtypes
for view in (df_ans.head(), df_ans.tail(), df_ans.describe()):
    display(view)
df_ans.info()

Unnamed: 0,word,count
0,cigar,3993843.0
1,rebut,235254.0
2,sissy,1410911.0
3,humph,60767.0
4,awake,3230337.0


Unnamed: 0,word,count
2310,judge,30905832.0
2311,rower,239796.0
2312,artsy,333481.0
2313,rural,30998929.0
2314,shave,2580057.0


Unnamed: 0,count
count,2313.0
mean,20565750.0
std,67284150.0
min,16161.0
25%,510507.0
50%,2171107.0
75%,9764137.0
max,1226734000.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2315 entries, 0 to 2314
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   word    2315 non-null   object 
 1   count   2313 non-null   float64
dtypes: float64(1), object(1)
memory usage: 36.3+ KB


In [36]:
# Min-max scale 'count' so the smallest value maps to 0 and the largest to 1;
# rows with a missing count stay NaN
other_counts = df_other['count']
lowest_count = other_counts.min()
count_range = other_counts.max() - lowest_count
df_other['Normalized_count'] = (other_counts - lowest_count) / count_range

# Show the first rows with the new column
display(df_other.head())

Unnamed: 0,word,count,Normalized_count
0,aahed,,
1,aalii,,
2,aargh,71592.0,0.000169
3,aarti,63273.0,0.000145
4,abaca,79841.0,0.000193


In [37]:
# Order the other-words table from most to least frequent (by normalized count)
df_other = df_other.sort_values('Normalized_count', ascending=False)

# Show the most frequent words
display(df_other.head())

Unnamed: 0,word,count,Normalized_count
1065,books,347710184.0,1.0
5194,links,339926541.0,0.977614
10436,years,337841309.0,0.971616
4386,items,330505325.0,0.950518
3363,games,305930896.0,0.87984


In [38]:
# Min-max scale the answers' 'count' column: (count - min) / (max - min)
ans_counts = df_ans['count']
df_ans['Normalized_count'] = ans_counts.sub(ans_counts.min()).div(
    ans_counts.max() - ans_counts.min()
)

# Show the first rows with the new column
display(df_ans.head())

Unnamed: 0,word,count,Normalized_count
0,cigar,3993843.0,0.003243
1,rebut,235254.0,0.000179
2,sissy,1410911.0,0.001137
3,humph,60767.0,3.6e-05
4,awake,3230337.0,0.00262
