In [103]:
# import packages
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB


### Load the dataset

- Load the training data and, using everything you have learned, explore the statistical properties of the dataset.

In [104]:
# Code starts here
# Read train.csv into a DataFrame and preview its first five rows.
data = pd.read_csv("train.csv")
data.head(n=5)
# Code ends here

Unnamed: 0,Id,TITLE,CATEGORY
0,50846,Ukraine to get $18 billion rescue from IMF,b
1,234375,McDonald's Abandons Headquarters to Avoid Prot...,b
2,63422,New study finds evidence that Autism begins in...,m
3,353942,Prime Minister Modi Says Meeting With Facebook...,t
4,311586,New robot guides at Tokyo museum almost outper...,t


In [105]:
# Number of rows per CATEGORY label.
data.loc[:, 'CATEGORY'].value_counts()

e    122013
b     92679
t     86846
m     36397
Name: CATEGORY, dtype: int64

In [106]:
# Drop the "Id" column; the later cells only read TITLE and CATEGORY.
# errors="ignore" keeps this cell safe to re-run: once "Id" is gone,
# a plain drop would raise KeyError on the second execution.
data = data.drop(columns="Id", errors="ignore")
data.head()

Unnamed: 0,TITLE,CATEGORY
0,Ukraine to get $18 billion rescue from IMF,b
1,McDonald's Abandons Headquarters to Avoid Prot...,b
2,New study finds evidence that Autism begins in...,m
3,Prime Minister Modi Says Meeting With Facebook...,t
4,New robot guides at Tokyo museum almost outper...,t


In [107]:
# Count the missing values in each column.
data.isna().sum()

TITLE       0
CATEGORY    0
dtype: int64

In [108]:

# Preview the first five rows of the frame.
data.head(5)

Unnamed: 0,TITLE,CATEGORY
0,Ukraine to get $18 billion rescue from IMF,b
1,McDonald's Abandons Headquarters to Avoid Prot...,b
2,New study finds evidence that Autism begins in...,m
3,Prime Minister Modi Says Meeting With Facebook...,t
4,New robot guides at Tokyo museum almost outper...,t


### Visualize and Preprocess the data

- Retaining only alphabets (Using regular expressions)
- Removing stopwords (Using nltk library)

In [109]:
# Code starts here
# English stopwords, held in a set for fast membership tests.
stop = set(stopwords.words('english'))


def _normalise_title(raw_title):
    """Return ``raw_title`` reduced to lowercase alphabetic, non-stopword words joined by spaces."""
    # 1. keep letters only: every other character becomes a space
    letters_only = re.sub("[^a-zA-Z]", " ", raw_title)
    # 2. lowercase and split on whitespace
    words = letters_only.lower().split()
    # 3. drop stopwords, 4. join the remaining words back into one string
    return ' '.join(word for word in words if word not in stop)


# Clean every title in a single pass.
data['TITLE'] = data['TITLE'].apply(_normalise_title)

In [110]:
# Code starts here
# Inspect the first five rows after cleaning the titles.
data.head(n=5)
# Code ends here

Unnamed: 0,TITLE,CATEGORY
0,ukraine get billion rescue imf,b
1,mcdonald abandons headquarters avoid protesters,b
2,new study finds evidence autism begins womb,m
3,prime minister modi says meeting facebook coo ...,t
4,new robot guides tokyo museum almost outperfor...,t


### Model building

- Now for the actual task: using any classifier, predict the `CATEGORY`. Use the different techniques you have learned to improve the performance of the model.
- Try improving upon the `accuracy_score` ([Accuracy Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html))

In [111]:
# Code starts here
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data['TITLE'],
    data['CATEGORY'],
    test_size=0.2,
    random_state=3,
)

# Count features: the vectorizer is fitted on the training titles only,
# then reused unchanged to encode the held-out titles.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# TF-IDF features, fitted and applied the same way.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
# Code ends here

In [112]:
# Train one multinomial naive Bayes model per feature representation.
nb_1 = MultinomialNB()
nb_2 = MultinomialNB()

# nb_1 learns from the count features, nb_2 from the tf-idf features
nb_1.fit(X_train_count, Y_train)
nb_2.fit(X_train_tfidf, Y_train)

# accuracy_score is documented as accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the numbers are unchanged, but keeping the documented order
# avoids silently wrong results if the metric is later swapped for an
# asymmetric one (e.g. confusion_matrix).
acc_count_nb = accuracy_score(Y_test, nb_1.predict(X_test_count))
acc_tfidf_nb = accuracy_score(Y_test, nb_2.predict(X_test_tfidf))

# display accuracies
print("acc_count_nb: {},\n acc_tfidf_nb: {}".format(acc_count_nb, acc_tfidf_nb))

acc_count_nb: 0.9272641188394218,
 acc_tfidf_nb: 0.925592199683371


In [113]:
# One-vs-rest logistic regression, one model per feature representation.
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))

# Suppress warnings only while fitting. A bare warnings.filterwarnings('ignore')
# would stay active for every later cell in the kernel session and could hide
# unrelated problems; catch_warnings() restores the previous filters on exit.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # logreg_1 learns from the count features, logreg_2 from the tf-idf features
    logreg_1.fit(X_train_count, Y_train)
    logreg_2.fit(X_train_tfidf, Y_train)

# accuracy_score is documented as accuracy_score(y_true, y_pred); the value is
# the same either way, but the documented order is kept for consistency.
acc_count_logreg = accuracy_score(Y_test, logreg_1.predict(X_test_count))
acc_tfidf_logreg = accuracy_score(Y_test, logreg_2.predict(X_test_tfidf))

# display accuracies
print("acc_count_logreg: {}, acc_tfidf_logreg: {}".format(acc_count_logreg, acc_tfidf_logreg))

acc_count_logreg: 0.9461730806220131, acc_tfidf_logreg: 0.941941497625283


### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [114]:
# Code Starts here
# Prediction on test data

# Load test.csv and set the ids aside before dropping the column.
test = pd.read_csv("test.csv")
# NOTE(review): `id` shadows the Python builtin; the name is kept because the
# submission cell further down reads it.
id = test["Id"].copy()
test = test.drop(columns="Id")

# Apply the same cleaning steps used on the training titles:
# letters only -> lowercase words -> stopwords removed -> joined with spaces.
stop = set(stopwords.words('english'))
test['TITLE'] = test['TITLE'].apply(
    lambda title: ' '.join(
        word
        for word in re.sub("[^a-zA-Z]", " ", title).lower().split()
        if word not in stop
    )
)
test.head()
# Code ends here

Unnamed: 0,TITLE
0,simple blood test detect solid cancers
1,mozilla appoints veteran chris beard interim ceo
2,fda abruptly reverses stance wooden aging boar...
3,cancer stats confirm value colonoscopy
4,apple samsung playing games citing big numbers...


In [122]:


# Encode the cleaned test titles with the already-fitted count vectorizer
# (transform only, so the learned vocabulary is reused as-is).
test_count = count_vectorizer.transform(test.loc[:, "TITLE"])
test_count





<84484x42616 sparse matrix of type '<class 'numpy.int64'>'
	with 568631 stored elements in Compressed Sparse Row format>

In [123]:

# Predict a CATEGORY for every test title with the logistic regression model
# that was trained on the count features.
y_pred = logreg_1.predict(X=test_count)
y_pred

array(['m', 't', 'm', ..., 'e', 't', 'e'], dtype='<U1')

In [124]:

# Build the submission frame: the test ids under an "Id" header, followed by
# the predicted CATEGORY. The id column was previously named '' which wrote a
# blank header to the CSV, while the task asks for the `Id` column.
sample_submission = pd.DataFrame({'Id': id, 'CATEGORY': y_pred})

# Convert the sample submission file into a csv file
# (index=False keeps the DataFrame's positional index out of the file)
sample_submission.to_csv('1st_submission.csv', index=False)