# Iris Dataset Classification - TASK #01

### Importing Libraries

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score,recall_score,precision_score

### Information about data

In [2]:
df = pd.read_csv("./spam.csv",encoding="latin1")

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ã_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [5]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
868,spam,Hello. We need some posh birds and chaps to us...,,,
2280,ham,R Ã_ comin back for dinner?,,,
4561,ham,How much u trying to get?,,,
3698,ham,How i noe... Did Ã_ specify da domain as nuss...,,,
3314,spam,FREE MESSAGE Activate your 500 FREE Text Messa...,,,


Here we have 
`v1` and `v2` and some other unnamed columns

In `v2` there is mail content and in `v1` it is labelled that if the respective email is spam(`spam`) or not (`ham`)

In [9]:
# Removing useless columns
df = df[['v1','v2']]

In [10]:
# info of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [11]:
# checking if there is any null value
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [13]:
# changing columns names and spam values
df.rename(columns={'v1':'spam/ham','v2':'sms'},inplace=True)

df.loc[df['spam/ham'] == 'spam', 'spam/ham',] = 0
df.loc[df['spam/ham'] == 'ham', 'spam/ham',] = 1

In [21]:
df.head()

Unnamed: 0,spam/ham,sms
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


### Train Test Split

In [22]:
X = df['sms']
y = df['spam/ham']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state=3)

In [24]:
X_test.head()

2632                       I WILL CAL YOU SIR. In meeting
454     Loan for any purpose Ã¥Â£500 - Ã¥Â£75,000. Hom...
983     LOOK AT THE FUCKIN TIME. WHAT THE FUCK YOU THI...
1282    Ever green quote ever told by Jerry in cartoon...
4610                                 Wat time Ã_ finish?
Name: sms, dtype: object

In [25]:
y_test.head()

2632    1
454     0
983     1
1282    1
4610    1
Name: spam/ham, dtype: object

### Vectorizer

In [28]:
vect = TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
vect

In [29]:
# changing datatypes
y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [31]:
xtrain_v = vect.fit_transform(X_train)
xtest_v = vect.transform(X_test)
print(X_train)

3075    Mum, hope you are having a great day. Hoping t...
1787                           Yes:)sura in sun tv.:)lol.
1614    Me sef dey laugh you. Meanwhile how's my darli...
4304                Yo come over carlos will be here soon
3266                    Ok then i come n pick u at engin?
                              ...                        
789                          Gud mrng dear hav a nice day
968             Are you willing to go for aptitude class.
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha. Why boss?
Name: sms, Length: 4457, dtype: object


### Training Model

In [32]:
model = LogisticRegression()

In [34]:
# fitting data info model
model.fit(xtrain_v,y_train)

In [36]:
# checking score
model.score(xtrain_v,y_train)

0.9661207089970832

In [37]:
model.score(xtest_v,y_test)

0.9623318385650225

#### Prediction

In [38]:
y_pred = model.predict(xtest_v)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [40]:
accuracy_score(y_test,y_pred)

0.9623318385650225

In [41]:
confusion_matrix(y_test,y_pred)

array([[114,  41],
       [  1, 959]], dtype=int64)

In [42]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.74      0.84       155
           1       0.96      1.00      0.98       960

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.96      0.96      0.96      1115



According to score our model is perfoming preety good :)