In [4]:
# Importing Libraries
import pandas as pd
from pandas import read_csv

## Dealing With Categorical Data

In [6]:
# Loading Dataset
filename = "dataset/breastcancers.csv"
data = read_csv(filename)
data.head()

Unnamed: 0,AGE,MENOPAUSE,TUMOUR_SIZE,INV_NODES,NODE_CAPS,DEG_MALIG,BREAST,BREAST_QUAD,IRRIDIAT,CLASS
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'


In [3]:
data.dtypes

'40-49'                object
'premeno'              object
'15-19'                object
'0-2'                  object
'yes'                  object
'3'                    object
'right'                object
'left_up'              object
'no'                   object
'recurrence-events'    object
dtype: object

In [16]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
def categorical_transform(data):
    le = preprocessing.LabelEncoder()
    data.AGE = le.fit_transform(data.AGE)
    data.MENOPAUSE = le.fit_transform(data.MENOPAUSE)
    data.TUMOUR_SIZE = le.fit_transform(data.TUMOUR_SIZE)
    data.INV_NODES = le.fit_transform(data.INV_NODES)
    data.NODE_CAPS = le.fit_transform(data.NODE_CAPS)
    data.DEG_MALIG = le.fit_transform(data.DEG_MALIG)
    data.BREAST = le.fit_transform(data.BREAST)
    data.BREAST_QUAD = le.fit_transform(data.BREAST_QUAD)
    data.IRRIDIAT = le.fit_transform(data.IRRIDIAT )
    data.CLASS = le.fit_transform(data.CLASS)
    

In [17]:
categorical_transform(data)

In [21]:
data.head()

Unnamed: 0,AGE,MENOPAUSE,TUMOUR_SIZE,INV_NODES,NODE_CAPS,DEG_MALIG,BREAST,BREAST_QUAD,IRRIDIAT,CLASS
0,2,2,2,0,1,2,1,2,0,1
1,3,0,2,0,0,0,1,0,0,0
2,3,0,6,0,0,1,0,1,0,1
3,2,2,6,0,1,2,1,1,1,0
4,2,2,5,4,1,1,0,4,0,1


In [19]:
data.dtypes

AGE            int64
MENOPAUSE      int64
TUMOUR_SIZE    int64
INV_NODES      int64
NODE_CAPS      int64
DEG_MALIG      int64
BREAST         int64
BREAST_QUAD    int64
IRRIDIAT       int64
CLASS          int64
dtype: object

In [23]:
data.to_csv("dataset/clean_breastcancers.csv")

## Dealing With Text

In [26]:
filename = "dataset/comments.csv"
df = read_csv(filename)
df.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [84]:
# Feature Selection
dataframe = df[['CONTENT','CLASS']]
df_x = dataframe['CONTENT']
df_y = dataframe['CLASS']

In [85]:
df_x.head()

0    Huh, anyway check out this you[tube] channel: ...
1    Hey guys check out my new channel and our firs...
2               just for test I have to say murdev.com
3     me shaking my sexy ass on my channel enjoy ^_^ ﻿
4              watch?v=vtaRGgvGtWQ   Check this out .﻿
Name: CONTENT, dtype: object

In [76]:
df_y.head()

Unnamed: 0,CLASS
0,1
1,1
2,1
3,1
4,1


In [86]:
# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
corpus = df_x
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [88]:
# Train Test Split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, df_y, test_size=0.3, random_state=42)

In [91]:
# Fit The Model
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9714285714285714

In [102]:
# Cross Validation
from sklearn.model_selection import cross_val_score
v = cross_val_score(clf, X_train, y_train, cv=10)
for i in range(5):
    print("Accuracy of clf is : {0:2%}".format(v[i,]))
print("Mean Accuracy is ", v.mean())

Accuracy of clf is : 96.000000%
Accuracy of clf is : 96.000000%
Accuracy of clf is : 92.000000%
Accuracy of clf is : 96.000000%
Accuracy of clf is : 100.000000%
Mean Accuracy is  0.9674999999999999


In [111]:
# Dealing With Model
import joblib as joblib
# Save the model
joblib.dump(clf, "model/comment_model.pkl")
print("Model Dumped In Disk")

# Load the model
comment_model = joblib.load("model/comment_model.pkl")
print("Model Loaded Successfully From Disk")

# Test the model
print("")
print("Model Prediction")
comment_model.predict(X_test)

Model Dumped In Disk
Model Loaded Successfully From Disk

Model Prediction


array([1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int64)

## Data With Number In Words

In [57]:
filename = "dataset/sales.csv"
dataset = read_csv(filename)
dataset.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,,2,500,300
1,,4,300,650
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350


In [58]:
# Handling Missing Values
dataset['rate'].fillna(0, inplace=True)


In [59]:
dataset.head(10)

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,2,500,300
1,0,4,300,650
2,four,600,200,400
3,nine,450,320,650
4,seven,600,250,350
5,five,550,200,700


In [60]:
# Feature Selection
x = dataset.iloc[:, :3] 
y = dataset.iloc[:, -1:]

In [61]:
# Converting Words To Number
def convert_to_number(dataset):
    word_dict = {'one':1, 'two':2, 'three':3, 'four':4, 'four':4, 'five':5, 'six':6, 'seven':7, 'eight':8, 'nine':9, 'ten':10, 'eleven':11, 'twelve':12, 'zero':0, 0:0}
    return word_dict[dataset]

In [64]:
x['rate'] = x['rate'].apply(lambda x : convert_to_number(x))

In [65]:
x.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month
0,0,2,500
1,0,4,300
2,4,600,200
3,9,450,320
4,7,600,250


In [112]:
y.head()

Unnamed: 0,sales_in_third_month
0,300
1,650
2,400
3,650
4,350


In [114]:
# Contatenation Of Dataframe
info = pd.concat([x,y], axis=1)
info.head()

Unnamed: 0,rate,sales_in_first_month,sales_in_second_month,sales_in_third_month
0,0,2,500,300
1,0,4,300,650
2,4,600,200,400
3,9,450,320,650
4,7,600,250,350
