# Importing Data Set from Kaggle

In [None]:
# Import the libraries used throughout the notebook.

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads when the corpus is built.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from google.colab import files

# Install the Kaggle command-line client (quietly); it is used below to download the dataset.
!pip install -q kaggle


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
! cp /root/kaggle/kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# Download the Jeopardy questions dataset archive with the Kaggle CLI.
# NOTE(review): the next cell reads the zip from /content/, so this presumably runs with
# /content as the working directory — confirm.
!kaggle datasets download -d tunguz/200000-jeopardy-questions

In [None]:
# Read the downloaded archive straight into a DataFrame.
jeopardy_zip_path = "/content/200000-jeopardy-questions.zip"
df = pd.read_csv(jeopardy_zip_path)

In [None]:
# Show the last rows of the frame to check that the file loaded through to the end.
df.tail()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
216925,4999,2006-05-11,Double Jeopardy!,RIDDLE ME THIS,$2000,This Puccini opera turns on the solution to 3 ...,Turandot
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",$2000,In North America this term is properly applied...,a titmouse
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,$2000,"In Penny Lane, where this ""Hellraiser"" grew up...",Clive Barker
216928,4999,2006-05-11,Double Jeopardy!,QUOTATIONS,$2000,"From Ft. Sill, Okla. he made the plea, Arizona...",Geronimo
216929,4999,2006-05-11,Final Jeopardy!,HISTORIC NAMES,,A silent movie title includes the last name of...,Grigori Alexandrovich Potemkin


In [None]:

# Summarise column dtypes and non-null counts before filtering.
df.info()

# Prune rows whose value is the literal string 'None'.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df[' Value'] != 'None'].copy()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1    Air Date    216930 non-null  object
 2    Round       216930 non-null  object
 3    Category    216930 non-null  object
 4    Value       216930 non-null  object
 5    Question    216930 non-null  object
 6    Answer      216928 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [None]:

# Convert the [' Value'] column to integers after stripping the '$' sign and the
# thousands separators from each value.
# astype(str) first keeps this cell re-runnable once the column already holds integers.
df[' Value'] = (
    df[' Value']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype(int)
)


In [None]:
# Goal: predict the dollar value of a question.

# Assumptions:
# The value of a question does not depend on Show Number, Air Date, Round, Category, or Answer.
# We use the question text as the only independent variable when predicting the value.

# Idea:

# 1) Use the bag-of-words model to turn each question into a feature vector.
# 2) Apply binning to the [Value] column to define the target categories.
# 3) Then perform multi-class classification using Logistic Regression.

#Data Preprocessing

In [None]:
# Columns in the dataset.
# Every label except 'Show Number' starts with a space (e.g. ' Value'), so the
# leading space must be included when selecting a column by name.

columnsList = df.columns
print(columnsList)

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')


In [None]:
# Create the classification targets by binning the dollar values.


def convert_to_categories(x):
    """Round a dollar value to the coarse bin that is used as its class label.

    Values below 1,000 are rounded to the nearest hundred, values below
    10,000 to the nearest thousand, and anything larger to the nearest
    ten thousand.

    Parameters
    ----------
    x : int or str
        The dollar value. Strings may be plain digits ("200") or still carry
        the raw formatting ("$2,000"); the '$' sign, commas and surrounding
        whitespace are removed before parsing.

    Returns
    -------
    The binned value, as returned by ``np.round`` on an integer.

    Raises
    ------
    ValueError
        If ``x`` cannot be parsed as an integer (for example the string 'None').
    """
    if isinstance(x, str):
        x = x.replace('$', '').replace(',', '').strip()
    # Parse once instead of once per comparison.
    amount = int(x)

    if amount < 1000:
        digits = -2
    elif amount < 10000:
        digits = -3
    else:
        digits = -4
    return np.round(amount, digits)

# Bin every value and keep the result in a new column that serves as the target.
binned_values = df[' Value'].map(convert_to_categories)
df['New_value'] = binned_values

In [None]:
# Show the first rows to compare the new New_value column with the original values.
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,New_value
0,4680,2004-12-31,Jeopardy!,HISTORY,200,"For the last 8 years of his life, Galileo was ...",Copernicus,200
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,200
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200,The city of Yuma in this state has a record av...,Arizona,200
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,200
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,200


In [None]:
# 1-D arrays holding the question text (features) and the binned value (target).
# Columns are selected by label rather than by position, so the selection stays
# correct even if the frame's column order changes.

X = df[' Question'].values
y = df['New_value'].values

In [None]:
# (Removed a stray `data_` expression: the name is never defined, so this cell
# raised NameError and stopped "Run All".)

In [None]:
# Show both arrays, then their lengths (one target per question, so they must match).
print(X, y, sep="\n")

print(len(X), len(y), sep="\n")

["For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory"
 'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'
 'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year'
 ...
 'In North America this term is properly applied to only 4 species that are crested, including the tufted'
 'In Penny Lane, where this "Hellraiser" grew up, the barber shaves another customer--then flays him alive!'
 'From Ft. Sill, Okla. he made the plea, Arizona is my land, my home, my father\'s land, to which I now ask to... return"']
[ 200  200  200 ... 2000 2000 2000]
213296
213296


In [None]:
# Build the cleaned text corpus for the bag-of-words model.

corpus = []

length = df.shape[0]

# Loop-invariant helpers are created once, not once per question (or per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Take 'not' out of the stop-word list so it is kept as a token.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

for question in X:
    # Replace everything except letters with spaces, lower-case, and tokenise.
    words = non_letters.sub(' ', question).lower().split()
    # Drop stop words, stem what is left, and store the question as one string.
    stems = [ps.stem(word) for word in words if word not in stopword_set]
    corpus.append(' '.join(stems))

In [None]:
# Sanity check: the loop appends one entry per question, so this should equal len(X).
print(len(corpus))

213296


In [None]:
# Turn the corpus into a document-term count matrix limited to 2000 features.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000)
term_counts = cv.fit_transform(corpus)
# NOTE(review): the dense conversion holds len(corpus) x 2000 cells in memory
# (about 4.3e8 for the 213296 questions here) — confirm the runtime has enough RAM,
# or keep the matrix as returned by fit_transform if downstream code allows it.
X_modified = term_counts.toarray()


In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X_modified,
    y,
    test_size=0.25,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train a multi-class Logistic Regression model on the training split.

from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression

# The 'saga' solver shuffles the data, so a fixed random_state keeps runs reproducible.
# multi_class is no longer passed: the argument is deprecated in recent scikit-learn,
# and the default already fits a multinomial model for a multi-class target with this solver.
model = LogisticRegression(solver='saga', max_iter=200, random_state=0)
model.fit(X_train, y_train)


In [None]:
# Evaluate the trained model on the held-out test set.

from sklearn.metrics import accuracy_score, classification_report

# Predict once and reuse the predictions for both metrics.
y_pred = model.predict(X_test)

# Store the score under its own name so the imported accuracy_score function is not shadowed.
test_accuracy = accuracy_score(y_test, y_pred)
print("Acuracy of model" , test_accuracy)
# zero_division=0 reports 0.0 for classes that are never predicted, without the
# undefined-metric warning.
print(classification_report(y_test, y_pred, zero_division=0))

Acuracy of model 0.19287750356312355
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
         100       0.08      0.00      0.01      2233
         200       0.18      0.14      0.16      7731
         300       0.05      0.00      0.00      2234
         400       0.21      0.55      0.30     10559
         500       0.05      0.00      0.01      2196
         600       0.14      0.02      0.04      5170
         700       0.00      0.00      0.00        48
         800       0.15      0.10      0.12      7991
         900       0.00      0.00      0.00        28
        1000       0.18      0.20      0.19      8248
        2000       0.23      0.13      0.17      6303
        3000       0.00      0.00      0.00       284
        4000       0.00      0.00      0.00       145
        5000       0.00      0.00      0.00        80
        6000       0.00      0.00      0.00        28
        7000       0.00      0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
