In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [2]:
data = pd.read_csv("../input/spam-detection/SMSSpamCollection", sep = '\t', names = ["labels", "message"])
data.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
stemmer = PorterStemmer() 
lemmatizer = WordNetLemmatizer()

corpus=[]
for i in range(len(data)):
    review= re.sub('^[A-Za-z]',' ',data['message'][i])
    review= review.lower()
    review= review.split() #get list of words
    review= [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review= ' '.join(review)
    corpus.append(review)

In [4]:
corpus[1]

'k lar... joking wif u oni...'

In [5]:
cv= CountVectorizer()
X= cv.fit_transform(corpus).toarray() # matrix creation- words as columns, sentences as rows
X.shape #5572 sentences, 8596 unique words in corpus

#8596 is too many columns, take frequent words only (limiting number of columns to 5000)

cv= CountVectorizer(max_features=5000)
X= cv.fit_transform(corpus).toarray() # matrix creation- words as columns, sentences as rows
X.shape

(5572, 5000)

In [6]:
#Convert labels into dummy variables since they are string

y= pd.get_dummies(data['labels']) #split into 2 columns ham and spam
y= y.iloc[:,1].values #take only spam column, as only 1 column is sufficient to represent spam

y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state =0)

In [8]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB().fit(X_train, y_train)
y_pred = spam_detect_model.predict(X_test)

In [9]:
from sklearn.metrics import confusion_matrix

confusion_m= confusion_matrix(y_test, y_pred)
confusion_m



array([[944,  11],
       [  7, 153]])

In [10]:
from sklearn.metrics import accuracy_score

accuracy= accuracy_score(y_test, y_pred)
accuracy

0.9838565022421525

In [11]:
from sklearn import datasets
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB


In [12]:
# loading iris dataset
df=datasets.load_iris()


In [13]:
# preparing a model
model=GaussianNB()
model.fit(df.data,df.target)
print(model)

GaussianNB()


In [14]:
expected=df.target
predicted=model.predict(df.data)
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.94      0.94      0.94        50
           2       0.94      0.94      0.94        50

    accuracy                           0.96       150
   macro avg       0.96      0.96      0.96       150
weighted avg       0.96      0.96      0.96       150

[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]


In [15]:
# example of grid searching key hyperparametres for KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# define dataset
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# define models and parameters
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.948000 using {'metric': 'euclidean', 'n_neighbors': 17, 'weights': 'uniform'}
0.849667 (0.039706) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.849667 (0.039706) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.895000 (0.037305) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.895000 (0.037305) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.915333 (0.030521) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.915333 (0.030521) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.920000 (0.023238) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.920000 (0.023238) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.924333 (0.026418) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.924333 (0.026418) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.932000 

Q2 step by step

In [16]:
import pandas as pd
import numpy as np
from sklearn import datasets
iris = datasets.load_iris()

In [17]:
x = iris.data
y = iris.target

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.7, random_state = 43)

In [19]:
import numpy as np
size = x_train.shape[0]
label_prob = {}
for outcome in np.unique(y_train):
  outcome_count = sum(outcome==y_train)
  label_prob[outcome] = outcome_count/size
print(label_prob)

{0: 0.3333333333333333, 1: 0.3142857142857143, 2: 0.3523809523809524}


In [20]:
target_names = list(iris.target_names)
target_names

['setosa', 'versicolor', 'virginica']

In [21]:
df = pd.DataFrame(x_train,columns = iris.feature_names)
df['target'] = y_train
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.7,2.8,4.5,1.3,1
1,5.5,2.6,4.4,1.2,1
2,5.7,2.6,3.5,1.0,1
3,6.7,3.1,4.4,1.4,1
4,6.3,3.3,4.7,1.6,1


In [22]:
def GaussianProb(mean,var,new_x):
  exp = np.exp((-(new_x - mean)**2)/(2*var))
  start = (1/(np.sqrt(2*np.pi*var))) 
  return start*exp

In [23]:
final_table = []
for i in range(3):
  new_table = []
  temp_df = df[df['target'] == i]
  for feature in iris.feature_names:
    mean,var = temp_df[feature].mean() , temp_df[feature].var()
    new_table.append([mean,var])
  final_table.append(new_table)

In [24]:
final_table[0]

[[5.031428571428571, 0.13457142857142856],
 [3.4342857142857146, 0.14467226890756302],
 [1.4828571428571429, 0.03322689075630252],
 [0.2571428571428571, 0.013109243697478993]]

In [25]:
test = x_test[0]
test

array([4.8, 3.1, 1.6, 0.2])

In [26]:
x_train[:5]

array([[5.7, 2.8, 4.5, 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [5.7, 2.6, 3.5, 1. ],
       [6.7, 3.1, 4.4, 1.4],
       [6.3, 3.3, 4.7, 1.6]])

In [27]:
y_pred = []
for i in range(len(x_test)): 
  test = x_test[i]
  probs = {}
  count = 0
  for each_label in final_table:  
    prob = label_prob[count]  
    feature_count = 0 
    for stats in each_label:  
     prob = prob * GaussianProb(stats[0],stats[1],test[feature_count])
     feature_count+=1 
    count+=1  
    probs[count-1] = prob 
  output =max(probs, key=probs.get)
  y_pred.append(output)

In [28]:
for i in range(len(y_test)):
  print(y_pred[i], y_test[i])

0 0
0 0
2 2
1 1
2 2
0 0
2 2
1 1
1 1
1 1
0 0
1 1
2 2
0 0
1 1
1 1
0 0
0 0
2 2
2 2
0 0
0 0
0 0
2 2
2 2
2 2
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
2 2
2 2
1 1
2 2
1 1
1 1
1 1
2 2
1 1
2 1
0 0


In [29]:
from sklearn.metrics import classification_report
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      0.94      0.97        17
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

