In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics 
import sklearn.svm as svm 
import sklearn.naive_bayes as naive_bayes
from sklearn.linear_model import LogisticRegression 
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB, BernoulliNB
import seaborn as sns 


In [43]:
# Load the training and test splits and stack them into a single frame so the
# cleaning steps are applied to both in one pass.
main_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat is the equivalent call: rows of test_df are placed after the
# training rows and the original row labels of both frames are kept.
main_df = pd.concat([main_df, test_df])

# 1. PreProcessing

In [44]:
# Preview the first five rows of the combined frame.
main_df.head(n=5)

Unnamed: 0,url,webpageDescription,alchemy_category,alchemy_category_score,avgLinkWordLength,AvglinkWithOneCommonWord,AvglinkWithTwoCommonWord,AvglinkWithThreeCommonWord,AvglinkWithFourCommonWord,redundancyMeasure,...,lengthyDomain,hyperlinkToAllWordsRatio,isFrontPageNews,alphanumCharCount,linksCount,wordCount,parametrizedLinkRatio,spellingErrorsRatio,label,id
0,http://www.polyvore.com/cgi/home?id=1389651,"{""title"":""Join Polyvore and follow the latest ...",?,?,1.916667,0.047619,0.007937,0.0,0.0,0.803797,...,0,34,0,682,126,1,0.531746,0.142857,1.0,3711
1,http://www.youtube.com/watch?v=ippMPPu6gh4,"{""body"":""Speed Air Man--David Belle david bell...",?,?,1.257576,0.141026,0.0,0.0,0.0,1.142857,...,0,12,0,3008,78,1,0.628205,0.0,1.0,7222
2,http://www.musingsofahousewife.com/2011/03/tri...,"{""title"":""Chicken Gruyere Recipe "",""body"":""Chi...",science_technology,0.386685,2.024,0.63035,0.284047,0.233463,0.202335,0.443409,...,1,17,0,11008,257,5,0.466926,0.0625,1.0,3964
3,http://www.thelittleteochew.com/2011/07/ikan-b...,"{""title"":""Ikan Bilis With Tempeh Groundnuts "",...",recreation,0.475039,1.665254,0.41958,0.199301,0.097902,0.066434,0.472649,...,1,28,0,4585,286,5,0.244755,0.085868,1.0,3697
4,http://recipes.wuzzle.org/index.php/72,"{""url"":""recipes wuzzle org index php 72"",""titl...",computer_internet,0.535009,0.181818,0.036364,0.0,0.0,0.0,0.292614,...,0,3,0,1745,55,1,0.072727,0.115044,1.0,4321


## 1.1 Checking for NULL and "?" values

In [45]:
# Per-column count of missing (NaN) entries.
main_df.isnull().sum()

url                              0
webpageDescription               0
alchemy_category                 0
alchemy_category_score           0
avgLinkWordLength                0
AvglinkWithOneCommonWord         0
AvglinkWithTwoCommonWord         0
AvglinkWithThreeCommonWord       0
AvglinkWithFourCommonWord        0
redundancyMeasure                0
embedRatio                       0
framebased                       0
frameTagRatio                    0
domainLink                       0
tagRatio                         0
imageTagRatio                    0
isNews                           0
lengthyDomain                    0
hyperlinkToAllWordsRatio         0
isFrontPageNews                  0
alphanumCharCount                0
linksCount                       0
wordCount                        0
parametrizedLinkRatio            0
spellingErrorsRatio              0
label                         1479
id                               0
dtype: int64

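No null values exist in the feature columns. The only nulls are the 1479 missing `label` values, presumably from the appended test rows, which carry no label.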

In [46]:
# Per-column count of cells holding the '?' placeholder.
main_df.eq('?').sum()

url                              0
webpageDescription               0
alchemy_category              2342
alchemy_category_score        2342
avgLinkWordLength                0
AvglinkWithOneCommonWord         0
AvglinkWithTwoCommonWord         0
AvglinkWithThreeCommonWord       0
AvglinkWithFourCommonWord        0
redundancyMeasure                0
embedRatio                       0
framebased                       0
frameTagRatio                    0
domainLink                       0
tagRatio                         0
imageTagRatio                    0
isNews                        2843
lengthyDomain                    0
hyperlinkToAllWordsRatio         0
isFrontPageNews               1248
alphanumCharCount                0
linksCount                       0
wordCount                        0
parametrizedLinkRatio            0
spellingErrorsRatio              0
label                            0
id                               0
dtype: int64

In [47]:
# Number of '?' placeholders in the isNews column.
main_df["isNews"].eq('?').sum()

2843

In [48]:
# Frequency of each distinct value in isNews before cleaning.
main_df.loc[:, "isNews"].value_counts()

1    4552
?    2843
Name: isNews, dtype: int64

Assuming the 2843 rows marked '?' are not news, we replace the '?' values with 0.

In [49]:
# Treat the '?' placeholder in isNews as "not news" (0).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: with pandas
# Copy-on-Write, an in-place call on a selected column does not update
# main_df (pandas 2.2 already warns about this pattern).
# pd.to_numeric then gives the column a single numeric dtype, the same way
# the isFrontPageNews column is handled, rather than leaving a mix of the
# original string values and the integer 0.
main_df["isNews"] = pd.to_numeric(main_df["isNews"].replace("?", 0))

In [50]:
# Frequency of each distinct value in isNews after the replacement.
main_df.loc[:, "isNews"].value_counts()

1    4552
0    2843
Name: isNews, dtype: int64

In [51]:
# Number of '?' placeholders in the isFrontPageNews column.
main_df["isFrontPageNews"].eq('?').sum()

1248

In [52]:
# Frequency of each distinct value in isFrontPageNews before cleaning.
main_df.loc[:, "isFrontPageNews"].value_counts()

0    5853
?    1248
1     294
Name: isFrontPageNews, dtype: int64

Assuming the rows marked "?" are not front-page news, we replace the "?" values with 0.

In [53]:
# Treat the "?" placeholder in isFrontPageNews as "not front page" and cast
# the column to a numeric dtype.
# The replacement is assigned back rather than done with inplace=True on the
# column selection: with pandas Copy-on-Write the in-place call would leave
# main_df untouched, and pd.to_numeric would then fail on the remaining "?".
main_df["isFrontPageNews"] = pd.to_numeric(
    main_df["isFrontPageNews"].replace("?", '0')
)

In [54]:
# Frequency of each distinct value in isFrontPageNews after the replacement.
main_df.loc[:, "isFrontPageNews"].value_counts()

0    7101
1     294
Name: isFrontPageNews, dtype: int64

In [55]:
# Number of '?' placeholders in the alchemy_category_score column.
main_df["alchemy_category_score"].eq('?').sum()

2342

In [61]:
# Frequency of each distinct raw value in alchemy_category_score.
main_df.loc[:, "alchemy_category_score"].value_counts()

?            2342
0.85           82
0.0784091      40
0.849999       12
0.535009       10
             ... 
0.733288        1
0.703947        1
0.93767         1
0.760163        1
0.715035        1
Name: alchemy_category_score, Length: 4806, dtype: int64

In [65]:
# Turn the '?' placeholders into NaN and cast the column to a numeric dtype,
# then summarise the resulting values.
main_df["alchemy_category_score"] = pd.to_numeric(
    main_df["alchemy_category_score"].replace('?', np.nan)
)
main_df["alchemy_category_score"].describe()

count    5053.000000
mean        0.603334
std         0.212864
min         0.070833
25%         0.452424
50%         0.625616
75%         0.780851
max         0.999426
Name: alchemy_category_score, dtype: float64

In [67]:
# Number of NaN entries in alchemy_category_score after the conversion.
main_df["alchemy_category_score"].isnull().sum()

2342

The mean and median are almost the same, so we replace the NaN values with the median.

In [69]:
# Fill the NaN scores with the column median and summarise the result.
# The filled Series is assigned back instead of calling
# fillna(..., inplace=True) on the column selection: with pandas
# Copy-on-Write the in-place call would not update main_df and the NaNs
# would silently remain.
# Note: the median is computed over every row currently in main_df.
main_df["alchemy_category_score"] = main_df["alchemy_category_score"].fillna(
    main_df["alchemy_category_score"].median()
)
main_df["alchemy_category_score"].describe()

count    7395.000000
mean        0.610391
std         0.176257
min         0.070833
25%         0.538758
50%         0.625616
75%         0.708279
max         0.999426
Name: alchemy_category_score, dtype: float64

In [70]:
# Summary (count, unique, top, freq) of the alchemy_category column.
main_df.loc[:, "alchemy_category"].describe()

count     7395
unique      14
top          ?
freq      2342
Name: alchemy_category, dtype: object

In [71]:
# Frequency of each category label, including the '?' placeholder.
main_df.loc[:, "alchemy_category"].value_counts()

?                     2342
recreation            1229
arts_entertainment     941
business               880
health                 506
sports                 380
culture_politics       343
computer_internet      296
science_technology     289
gaming                  76
religion                72
law_crime               31
unknown                  6
weather                  4
Name: alchemy_category, dtype: int64

Mapping '?' to the existing `unknown` category

In [72]:
# Fold the '?' placeholder into the "unknown" category label.
# The result is assigned back instead of calling replace(..., inplace=True)
# on the column selection: with pandas Copy-on-Write the in-place call would
# not update main_df.
main_df["alchemy_category"] = main_df["alchemy_category"].replace("?", "unknown")

In [73]:
# Re-check every column for leftover '?' placeholders.
main_df.eq('?').sum()

url                           0
webpageDescription            0
alchemy_category              0
alchemy_category_score        0
avgLinkWordLength             0
AvglinkWithOneCommonWord      0
AvglinkWithTwoCommonWord      0
AvglinkWithThreeCommonWord    0
AvglinkWithFourCommonWord     0
redundancyMeasure             0
embedRatio                    0
framebased                    0
frameTagRatio                 0
domainLink                    0
tagRatio                      0
imageTagRatio                 0
isNews                        0
lengthyDomain                 0
hyperlinkToAllWordsRatio      0
isFrontPageNews               0
alphanumCharCount             0
linksCount                    0
wordCount                     0
parametrizedLinkRatio         0
spellingErrorsRatio           0
label                         0
id                            0
dtype: int64

----------------- all '?' values have been replaced; the only remaining nulls are the 1479 missing `label` values ---------------