In [2]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import re

## This is a function that preprocesses our articles

In [3]:
import csv
import math
import random

In [4]:
def load_CSV(filename):
    lines = csv.reader(open(r'C:\Users\Percy Mohlala\Desktop\diabetes.csv'))
    next(lines)
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset   

In [5]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) <trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
        
    return [trainSet, copy]

In [6]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if(vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
        
    return separated

In [7]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [8]:
def stdDev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers) - 1)
    return math.sqrt(variance)

In [9]:
def summarize(dataset):
    summaries = [(mean(attribute), stdDev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

In [10]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    
    return summaries

In [11]:
def calcProb(x, mu,stdDev):
    exponent = math.exp(-(math.pow(x-mu,2) / (2*math.pow(stdDev,2))))

In [12]:
def calcClassProbabilities(summaries, inputvector):
    probabilitis = {}
    for classValue, classSummaries in summaries.items():
        probabilitis[classValue] = 1
        
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputvector[i]
            
            probabilitis[classValue] *= calcProb(x, mean, stdDev)
            
        return probabilitis

In [13]:
def predict(summaries, inputVector):
    probabilities = calcClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    
    return bestLabel

In [14]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[1])
        predictions.append(result)
        
    return predictions

In [15]:
def getAccuracy(testSet, predictons):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictons[x]:
            correct += 1
            
    return(correct/float(len(testSet)))*100.0

In [16]:
from sklearn.preprocessing import LabelEncoder

def main():
    filename = 'bbc_news.csv'
    splitRatio = 0.67
    dataset = load_CSV(filename)
    
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    
    
    print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset),len(trainingSet),len(testSet)))
    
    #prepare model
    summaries = summarizeByClass(trainingSet)
    
    #test model
    predictions = getPredictions(summaries, testSet
                                )
    #accuracy = getAccuracy(testSet, predictions)
    #print('Accuracy: {0}%'.format(accuracy))
    
main()

Split 768 rows into train = 514 and test = 254 rows


TypeError: must be real number, not function

In [17]:
col_names = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

train_raw = pd.read_csv('D:/Dataset/train.csv')
test_raw = pd.read_csv('D:/Dataset/test.csv')

test_IDs = test_raw['PassengerId'].values

train_raw['train'] = 1
test_raw['train'] = 0

data = train_raw.append(test_raw, sort = False)

In [18]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [19]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


## Preprocessing the data, feature selection, data cleaning, feature engineering and data imputation

In [20]:
features = ['Age', 'Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp']
target = 'Survived'

data = data[features + [target] + ['train']]

#transforming categorical values to numeric

#data['Sex'] = data['Sex'].replace(["female", "male"], [0, 1])
data['Sex'] = data['Sex'].map({'female':0, 'male':1})
data['Embarked'] = data['Embarked'].map({'S':1, 'C':2, 'Q':3})
data['Age'] = pd.qcut(data['Age'].rank(method='first'), 10, labels = False)

data.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,train
0,2.0,1.0,7.25,0,3,1,1,0.0,1
1,7.0,2.0,71.2833,0,1,0,1,1.0,1
2,4.0,1.0,7.925,0,3,0,0,1.0,1
3,6.0,1.0,53.1,0,1,0,1,1.0,1
4,6.0,1.0,8.05,0,3,1,0,0.0,1


## splitting the data

In [21]:
train = data[data.train == 1]
test = data[data.train == 0]

#drop missing values from train set

train.dropna(inplace=True)
labels = train[target].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [51]:
train.head(5)

Unnamed: 0,Age,Embarked,Fare,Parch,Sex,SibSp
0,2.0,1.0,7.25,0,1,1
1,7.0,2.0,71.2833,0,0,1
2,4.0,1.0,7.925,0,0,0
3,6.0,1.0,53.1,0,0,1
4,6.0,1.0,8.05,0,1,0


In [52]:
test.isnull().sum()

Age         86
Embarked     0
Fare         1
Parch        0
Sex          0
SibSp        0
dtype: int64

In [53]:
train.drop(['train', target, 'Pclass'], axis=1, inplace=True)
test.drop(['train', target, 'Pclass'], axis=1, inplace=True)



KeyError: "['train' 'Survived' 'Pclass'] not found in axis"

In [54]:
train.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Sex,SibSp
0,2.0,1.0,7.25,0,1,1
1,7.0,2.0,71.2833,0,0,1
2,4.0,1.0,7.925,0,0,0
3,6.0,1.0,53.1,0,0,1
4,6.0,1.0,8.05,0,1,0


In [55]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.2, random_state=1)

In [56]:
X_train.head(5)

Unnamed: 0,Age,Embarked,Fare,Parch,Sex,SibSp
830,1.0,2.0,14.4542,0,0,1
566,2.0,1.0,7.8958,0,1,0
149,7.0,1.0,13.0,0,1,0
106,2.0,1.0,7.65,0,0,0
290,4.0,1.0,78.85,0,0,0


In [57]:
X_train.head(15)

Unnamed: 0,Age,Embarked,Fare,Parch,Sex,SibSp
830,1.0,2.0,14.4542,0,0,1
566,2.0,1.0,7.8958,0,1,0
149,7.0,1.0,13.0,0,1,0
106,2.0,1.0,7.65,0,0,0
290,4.0,1.0,78.85,0,0,0
791,1.0,1.0,26.0,0,1,0
851,9.0,1.0,7.775,0,1,0
136,1.0,1.0,26.2833,2,0,0
20,6.0,1.0,26.0,0,1,0
877,2.0,1.0,7.8958,0,1,0


In [58]:
X_train.isnull().sum()

Age         0
Embarked    0
Fare        0
Parch       0
Sex         0
SibSp       0
dtype: int64

In [59]:
X_train1, X_train2, y_train1, y_train2 = train_test_split(X_train, y_train, test_size=0.3, random_state=12)

In [74]:
class multiNB:
    
    def __init__(self, alpha=0.1):
        self.alpha = alpha
       
        
    def fit(self, X_train, y_train):
        
        samples_no, no_features = X_train.shape #get the number of rows and columns using the .shape method
        
        
        self.classes_= np.unique(y_train) #get the number of unique classes in our dataset
        
        no_classes = len(self.classes_)
        
        #inititalising our priors and likelihoods
        
        self.priors_ = np.zeros(no_classes)
        self.likelihoods_ = np.zeros((no_classes, no_features))
        
        #here we're finding our priors and likelihoods
        
        for indx, single_class in enumerate(self.classes_):
            X_train_clss = X_train[single_class == y_train]
            self.priors_[indx] = (X_train_clss.shape[0]/ float(samples_no)) #X_class.shape[0] gets the number of samples with single_class as the label
            self.likelihoods_[indx, :] = ((X_train_clss.sum(axis=0)) + self.alpha) / (np.sum(X_train_clss.sum(axis=0) + self.alpha)) #we include alpha here to smooth our table of likelihoods
            
      
    def predictSinglePoint(self, x):
        #calculate the posterior probability
        
        posteriors = []    
        
        #x = float(x)
        
        for indx, single_class in enumerate(self.classes_): #get the index and class labels using the enumerate function
            prior =np.log(self.priors_[indx]) #we use the priors we calculated in our fit method
        
            #calculate the likelihood and the prior for each class 
            likelihood_class = np.log(self.likelihoods_[indx, :])
            
            total = np.sum(likelihood_class) #sum all our class likelihoods
            posterior = prior + total
            
            posteriors.append(posterior)
            
        #choose the class with the highest probability using the inbuilt numpy argmax function
        
        output = self.classes_[np.argmax(posteriors)]
        
        return output
    
    def predict(self, X_test):
        result = []
        
        for x in X_test:
            prediction = self.predictSinglePoint(x) #here we get the prediction for each given row
            result.append(prediction)               #append the prediction to our list of results
        
        return result
    
    def acc_score(self, X_test, y_test):
        
        y_predicted = self.predict(X_test)
        score = float(np.sum(y_predicted==y_test)/len(y_test))
        
        return score

In [75]:
clsf = multiNB()



In [76]:
from sklearn import datasets

X,y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=123)
x_Train, x_Test, Y_train, Y_test = train_test_split(X,y, test_size=0.2, random_state=123)
clsf.fit(X_train, y_train)

In [77]:
predictions = clsf.predict(X_train1)
X_test.dtypes

Age         float64
Embarked    float64
Fare        float64
Parch         int64
Sex           int64
SibSp         int64
dtype: object

In [78]:
predictions

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]