# A basic data cleaning project, using loops mostly.


In [27]:
#importing all the necessary libraries
import pandas as pd
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier


In [28]:
import warnings
warnings.filterwarnings("ignore")

In [29]:
# Fetch the NLTK resources needed later: the stop-word list and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    download_ok = nltk.download(corpus_name)
# Keep the last download's return value as the cell output.
download_ok

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
# Read the raw dataset from disk (decoded as latin-1) and preview its first ten rows.
dataset_path = '/content/dataset.csv'
df = pd.read_csv(dataset_path, encoding='latin-1')
df.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [51]:
# Prepare the frame for modelling: discard the three "Unnamed" columns and
# give the remaining v1 / v2 columns descriptive names.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
column_names = {"v1": "label", "v2": "text"}
df.drop(columns=unused_columns, inplace=True)
df.rename(columns=column_names, inplace=True)

In [52]:
# Preview the frame after the column clean-up.
df.head(n=5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [53]:
# Encode the target numerically: 'ham' -> 0, every other label (i.e. 'spam') -> 1.
# A single vectorized column assignment replaces the row-by-row chained
# indexing (df['label'][i] = ...), which pandas does not guarantee to write
# back into df.
# Accepting 0 alongside 'ham' keeps the cell safe to re-run on labels that
# have already been encoded.
df['label'] = np.where(df['label'].isin(['ham', 0]), 0, 1)


In [54]:
# Inspect the first rows to confirm the label column is now numeric.
df.head(n=5)


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [55]:
# Count fully duplicated rows (True) versus first-seen rows (False).
duplicate_mask = df.duplicated()
duplicate_mask.value_counts()

False    5169
True      403
dtype: int64

In [56]:
# The previous check reported 403 duplicated rows; keep only the first
# occurrence of each row.
df = df.drop_duplicates(keep='first')
# Re-run the check: no row should be flagged as a duplicate any more.
df.duplicated().value_counts()

False    5169
dtype: int64

## Data preprocessing - Removing punctuation and digits

In [57]:

import string

# Punctuation characters to strip, held in a set for fast membership tests.
punctuation_chars = frozenset(string.punctuation)


def _strip_punctuation_and_digits(text):
    """Return `text` with ASCII punctuation and digit characters removed.

    Every other character, whitespace included, is kept in its original order.
    """
    return "".join(
        ch for ch in text
        if ch not in punctuation_chars and not ch.isdigit()
    )


# Build the cleaned column in one assignment instead of writing cell by cell
# through chained indexing (df['cleaned_txt'][i] = ...), which pandas does not
# guarantee to write back into df.
df['cleaned_txt'] = df['text'].map(_strip_punctuation_and_digits)

# removing the existing text column
df.drop(['text'], axis=1, inplace=True)

df.head()

Unnamed: 0,label,cleaned_txt
0,0,Go until jurong point crazy Available only in ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in a wkly comp to win FA Cup final...
3,0,U dun say so early hor U c already then say
4,0,Nah I dont think he goes to usf he lives aroun...


## Tokenization and converting to lower case

In [58]:
import re

# Raw string: "\W" inside a normal string literal is an invalid escape sequence.
token_pattern = re.compile(r"\W+")

# Lower-case each message, then split it on runs of non-word characters.
# Assigning the whole column at once avoids the chained indexing
# (df['token'][i] = ...) that pandas does not guarantee to write back into df.
df['token'] = df['cleaned_txt'].map(
    lambda text: token_pattern.split(text.lower())
)

df.head()

Unnamed: 0,label,cleaned_txt,token
0,0,Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,0,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in a wkly comp to win FA Cup final...,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,0,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


## Stop words

In [59]:
# English stop words from NLTK; the set gives constant-time lookups per token
# instead of scanning the list for every word.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = frozenset(stopwords)

# Keep only the tokens that are not stop words, preserving their order.
# NOTE(review): punctuation was stripped before this step, so contractions now
# look like "dont" and are compared in that form - confirm this is intended.
df['token_updated'] = df['token'].map(
    lambda tokens: [word for word in tokens if word not in stopword_set]
)

# now removing the existing token column
df.drop(['token'], axis=1, inplace=True)

df.head()

Unnamed: 0,label,cleaned_txt,token_updated
0,0,Go until jurong point crazy Available only in ...,"[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in a wkly comp to win FA Cup final...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,U dun say so early hor U c already then say,"[u, dun, say, early, hor, u, c, already, say]"
4,0,Nah I dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t..."


## Lemmatization

In [62]:
# NOTE(review): 'omw-1.4' is presumably needed by the WordNet lemmatizer in
# this NLTK version - confirm, and consider downloading it with the other
# corpora at the top of the notebook.
nltk.download('omw-1.4')
lem = nltk.WordNetLemmatizer()

# Lemmatize every token of every message. The column is assigned in one step
# rather than through chained indexing (df['lematized'][i] = ...), which
# pandas does not guarantee to write back into df.
df['lematized'] = df['token_updated'].map(
    lambda tokens: [lem.lemmatize(word) for word in tokens]
)

# the stop-word-filtered tokens are no longer needed
df.drop(['token_updated'], axis=1, inplace=True)

df.head()

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,label,cleaned_txt,lematized
0,0,Go until jurong point crazy Available only in ...,"[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in a wkly comp to win FA Cup final...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,U dun say so early hor U c already then say,"[u, dun, say, early, hor, u, c, already, say]"
4,0,Nah I dont think he goes to usf he lives aroun...,"[nah, dont, think, go, usf, life, around, though]"


## Merging the tokens

In [63]:
# Join each message's lemmatized tokens back into one space-separated string.
# The column is assigned in one step rather than through chained indexing
# (df['cleaned_string'][i] = ...), which pandas does not guarantee to write
# back into df.
df['cleaned_string'] = df['lematized'].map(" ".join)

# dropping the lematized column
df.drop(['lematized'], axis=1, inplace=True)

df.head(10)

Unnamed: 0,label,cleaned_txt,cleaned_string
0,0,Go until jurong point crazy Available only in ...,go jurong point crazy available bugis n great ...
1,0,Ok lar Joking wif u oni,ok lar joking wif u oni
2,1,Free entry in a wkly comp to win FA Cup final...,free entry wkly comp win fa cup final tkts st ...
3,0,U dun say so early hor U c already then say,u dun say early hor u c already say
4,0,Nah I dont think he goes to usf he lives aroun...,nah dont think go usf life around though
5,1,FreeMsg Hey there darling its been weeks now ...,freemsg hey darling week word back id like fun...
6,0,Even my brother is not like to speak with me T...,even brother like speak treat like aid patent
7,0,As per your request Melle Melle Oru Minnaminun...,per request melle melle oru minnaminunginte nu...
8,1,WINNER As a valued network customer you have b...,winner valued network customer selected receiv...
9,1,Had your mobile months or more U R entitled t...,mobile month u r entitled update latest colour...


In [64]:
# The punctuation-free intermediate text is no longer needed.
df.drop(columns=['cleaned_txt'], inplace=True)
df.head(n=5)

Unnamed: 0,label,cleaned_string
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor u c already say
4,0,nah dont think go usf life around though


Now the data is cleaned and pre-processed for further use.


In [66]:
# Persist the cleaned frame as CSV, using pandas' default options.
# NOTE(review): the file name has no ".csv" extension and "dectection" looks
# like a typo; left unchanged in case something else reads this exact path.
output_path = 'cleaned_dataset_spam_dectection'
df.to_csv(output_path)

In [67]:
# Separate the target (kept as a single-column frame) from the feature columns.
y = df['label'].to_frame()
x = df.drop(columns=['label'])

In [68]:
# Count vectorization followed by TF-IDF weighting.
# The vocabulary is capped at the 5000 most frequent terms.
count = CountVectorizer(max_features=5000)
# Keep the counts sparse: TfidfTransformer accepts sparse input, so the
# intermediate dense copy of the count matrix is unnecessary.
term_counts = count.fit_transform(x['cleaned_string'].values.astype('U'))
tf = TfidfTransformer()
tfidf_weights = tf.fit_transform(term_counts)
# Densify once, aligned on x's index so the concat below matches row for row.
temp1 = pd.DataFrame(tfidf_weights.toarray(), index=x.index)
x = pd.concat([x, temp1], axis=1, sort=False)

In [70]:
# Drop the text column so that only the numeric TF-IDF columns remain in x.
x.drop(columns=['cleaned_string'], inplace=True)

In [71]:
# Make sure the target frame has an integer dtype.
y = y.astype(dtype=int)

In [72]:
# Create a random forest classifier with 100 trees and a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=0)
# y is a single-column DataFrame; flatten it to the 1-D target that
# scikit-learn expects, otherwise fit() emits a DataConversionWarning.
model.fit(x, y.values.ravel())

RandomForestClassifier(random_state=0)

In [73]:
# Mean accuracy of the fitted model on x / y.
# NOTE(review): x and y are the same data the model was fitted on, so this is
# training accuracy, not an estimate of performance on unseen messages.
training_accuracy = model.score(x, y)
training_accuracy

1.0