https://www.springboard.com/blog/data-mining-python-tutorial/

http://www.developintelligence.com/blog/2017/08/data-cleaning-pandas-python/


https://machinelearningmastery.com/quick-and-dirty-data-analysis-with-pandas/

Example of a simple classification problem:
https://towardsdatascience.com/solving-a-simple-classification-problem-with-python-fruits-lovers-edition-d20ab6b071d2

Cross-validation in sklearn
http://scikit-learn.org/stable/modules/cross_validation.html

## Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns
from matplotlib import rcParams

from collections import Counter

%matplotlib inline

## Load dataset

In [None]:
#read the survey export; skiprows=[1] drops the file line with 0-based index 1
survey_path = 'ODI-2018.csv'
df = pd.read_csv(survey_path, skiprows=[1])

#work on a copy so the raw answers in df stay untouched
clean = df.copy()

#show first 10 responses
df.head(n=10)

In [None]:
#per column: does it contain at least one missing value?
df.isna().any()

In [None]:
#Inspect the dtype pandas inferred for every column
df.dtypes

In [None]:
#show summary statistics of the raw dataset
#(describe() summarises the numeric columns by default when any are present)
df.describe()

In [None]:
#Clean the programme column (df.columns[1]): normalise spelling variants to one
#canonical programme name per study and turn non-answers into NaN
#source: https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict
col = df.columns[1]

#lower-case, trim and spell out '&' so the lookup keys below match
data = df[col].str.lower()
data = data.str.strip()
data = data.str.replace('&','and')

#np.nan is used because the np.NaN alias no longer exists in NumPy 2.0;
#answers that are not a key of this dict keep their normalised (lower-case) text
clean[col] = data.replace(
    {'21-05-1995':np.nan,
    'a. i.':'Artificial Intelligence',
    'ai':'Artificial Intelligence',
    'ai (cognitive sciences)':'Artificial Intelligence',
    'ai (vu version)':'Artificial Intelligence',
    'ai vu':'Artificial Intelligence',
    'ai premaster':'Artificial Intelligence',
    'artificial intelligence (socially aware computing)':'Artificial Intelligence',
    'artificial intelligence':'Artificial Intelligence',
    'ba':'Business Analytics',
    'b science, business and innovation':'Science, Business & Innovation',
    'big data engineering':'Computer Science',
    'bioinformatcis':'Bioinformatics & Systems Biology',
    'bioinformatics and system biology':'Bioinformatics & Systems Biology',
    'bioinformatics':'Bioinformatics & Systems Biology',
    'bioinformatics and systems biology':'Bioinformatics & Systems Biology',
    'bioinformatics and sysbio':'Bioinformatics & Systems Biology',
    'bioinformatics master':'Bioinformatics & Systems Biology',
    'business analytics msc':'Business Analytics',
    'business analytics/ operations research':'Business Analytics',
    'business analytics':'Business Analytics',
    'cls':'Computational Science',
    'cs':'Computer Science',
    'csl':'Computational Science',
    'comoputational science':'Computational Science',
    'computational science (jd)':'Computational Science',
    'computational science':'Computational Science',
    'computer science':'Computer Science',
    'computer science: big data engineering':'Computer Science',
    'data mining techniques':np.nan,
    'drug discovery and safety':'Drug Discovery and Safety',
    'duisenberg honors program quantitative risk managament':'Quantitative Risk Management',
    'duisenberg quantitative risk management':'Quantitative Risk Management',
    'econometrics':'Econometrics',
    'econometrics and operations research':'Econometrics and Operations Research',
    'economics':'Economics',
    'eor':'Econometrics and Operations Research',
    'exchange':'Exchange student',
    'finance':'Finance',
    'finance dhp qrm':'Quantitative Risk Management',
    'm financial economtrics':'Econometrics',
    'ma bioinformatics':'Bioinformatics & Systems Biology',
    'master bionformatics and systems biology':'Bioinformatics & Systems Biology',
    'master business analytics':'Business Analytics',
    'master computer science: big data engineering':'Computer Science',
    'master econometrics and operations research':'Econometrics and Operations Research',
    'master human movement science':'Human Movement Sciences',
    'masters computer science(big data engineering)':'Computer Science',
    'mathematics':'Mathematics',
    'mathematics exchange':'Exchange student',
    'mpa':'Management, Policy-Analysis & Entrepreneurship in Health and Life Sciences',
    'ms':np.nan,
    'msc ai and msc cls':'Artificial Intelligence',
    'msc artificial intelligence':'Artificial Intelligence',
    'msc bioinformatics':'Bioinformatics & Systems Biology',
    'msc bioinformatics and systems biology':'Bioinformatics & Systems Biology',
    'msc computational science':'Computational Science',
    'msc computational science (joint degree)':'Computational Science',
    'msc computer science':'Computer Science',
    'msc econometrics':'Econometrics',
    'msc. bioinformatics and systems biology':'Bioinformatics & Systems Biology',
    'or':'Econometrics and Operations Research',
    'phd':'PhD',
    'phd student':'PhD',
    'phd student at fgb':'PhD',
    'physics':'Physics',
    'qrm':'Quantitative Risk Management',
    'quantitative risk management':'Quantitative Risk Management',
    'system biology and bioinformatics':'Bioinformatics & Systems Biology'
    })

#show number of non-NaN values and number of NaN values after cleaning
print(col)
print("#values: %d"%(clean[col].count()))
print("#NaN: %d"%(clean[col].isnull().sum()))

#show programmes sorted on occurrence
Counter(clean[col].dropna()).most_common()

In [None]:
#replace labelled data by integers: 'yes' -> 1, 'no' -> 0, 'unknown' -> NaN
#(np.nan, because the np.NaN alias no longer exists in NumPy 2.0)
col = df.columns[2]
clean[col] = df[col].replace({
    'yes':1,
    'no':0,
    'unknown':np.nan
})

#print number of non-NaN values and number of NaN values after cleaning
print(col)
print("#values: %d"%(clean[col].count()))
print("#unknown: %d"%(clean[col].isnull().sum()))

#show answers sorted on occurrence
Counter(clean[col].dropna()).most_common()

In [None]:
#replace labelled data by integers: '1' -> 1, '0' -> 0, 'unknown' -> NaN
#(np.nan, because the np.NaN alias no longer exists in NumPy 2.0)
col = df.columns[3]
clean[col] = df[col].replace({
    '1':1,
    '0':0,
    'unknown':np.nan
})

#print number of non-NaN values and number of NaN values after cleaning
print(col)
print("#values: %d"%(clean[col].count()))
print("#unknown: %d"%(clean[col].isnull().sum()))

#show answers sorted on occurrence
Counter(clean[col].dropna()).most_common()

In [None]:
#replace labelled data by integers: 'mu' -> 1, 'sigma' -> 0, 'unknown' -> NaN
#(np.nan, because the np.NaN alias no longer exists in NumPy 2.0)
col = df.columns[4]
clean[col] = df[col].replace({
    'mu':1,
    'sigma':0,
    'unknown':np.nan
})

#print number of non-NaN values and number of NaN values after cleaning
print(col)
print("#values: %d"%(clean[col].count()))
print("#unknown: %d"%(clean[col].isnull().sum()))

#show answers sorted on occurrence
Counter(clean[col].dropna()).most_common()

In [None]:
#replace labelled data by integers: 'ja' -> 1, 'nee' -> 0, 'unknown' -> NaN
#(np.nan, because the np.NaN alias no longer exists in NumPy 2.0)
col = df.columns[5]
clean[col] = df[col].replace({
    'ja':1,
    'nee':0,
    'unknown':np.nan
})

#print number of non-NaN values and number of NaN values after cleaning
print(col)
print("#values: %d"%(clean[col].count()))
print("#unknown: %d"%(clean[col].isnull().sum()))

#show answers sorted on occurrence
Counter(clean[col].dropna()).most_common()

In [None]:
#encode the answers as integers: 'male' -> 1, 'female' -> 0, 'unknown' -> NaN
#(np.nan, because the np.NaN alias no longer exists in NumPy 2.0)
col = df.columns[6]
clean[col] = df[col].replace({
    'unknown':np.nan,
    'male':1,
    'female':0
})

#print number of non-NaN values and number of NaN values after cleaning
print(col)
print("#values: %d"%(clean[col].count()))
print("#unknown: %d"%(clean[col].isnull().sum()))

#show answers sorted on occurrence
Counter(clean[col].dropna()).most_common()

In [None]:
#replace 'unknown' by NaN; all other answers are kept as text
#(np.nan, because the np.NaN alias no longer exists in NumPy 2.0)
col = df.columns[7]
clean[col] = df[col].replace({
    'unknown':np.nan
})

#show answers sorted on occurrence
print(col)
Counter(clean[col].dropna()).most_common()

In [None]:
def clean_date(data):
    """Normalise free-text birthday answers to zero-padded ``DD-MM`` strings.

    Recognised layouts (separators may be ``-``, ``/``, ``.``, ``,`` or
    whitespace; month names from the lookup table below are accepted in the
    first two positions; ordinal suffixes such as ``24th`` are ignored):

    * ``MM-DD[-YYYY]`` -- chosen when the second number can only be a day
    * ``DD-MM[-YYYY]``
    * ``YYYY-MM-DD`` and ``YYYY-DD-MM`` (year must be > 1900)

    Parameters
    ----------
    data : pandas.Series
        Raw answers; entries that are not strings are treated as missing.

    Returns
    -------
    pandas.Series
        ``"DD-MM"`` strings, or NaN where no valid day/month pair could be
        recovered. The result carries the index of ``data`` so it aligns
        when assigned back into a DataFrame column.
    """
    import re  #local import so the function does not rely on a module-level import

    #dict to replace month names by month number
    month_numbers = {'january':1,'february':2,'march':3,'april':4,'may':5,
                     'june':6,'july':7,'august':8,'september':9,'october':10,
                     'november':11,'december':12,'aug':8,'februari':2}

    #ordinal suffix directly after a digit (1st, 2nd, 3rd, 24th)
    ordinal_suffix = re.compile(r"(?<=\d)(?:st|nd|rd|th)")
    #runs of separators collapse to a single split point
    separators = re.compile(r"[-/.,\s]+")

    def to_number(token, allow_month_name=True):
        #int value of the token, month number for a known month name, else NaN
        try:
            return int(token)
        except (TypeError, ValueError):
            if allow_month_name and token is not None:
                return month_numbers.get(token, np.nan)
            return np.nan

    #comparisons with NaN are False, so missing tokens never pass these checks
    def is_day(value):
        return 1 <= value <= 31

    def is_month(value):
        return 1 <= value <= 12

    cleaned = []
    for raw in data:
        tokens = []
        if isinstance(raw, str):
            text = ordinal_suffix.sub("", raw.strip().lower())
            tokens = [token for token in separators.split(text) if token]
        #pad so that positions 0..2 always exist
        tokens = (tokens + [None] * 3)[:3]

        #first two positions may be DD, MM(M) or YYYY, e.g. 28-10-1994 or March 24th
        first = to_number(tokens[0])
        second = to_number(tokens[1])
        #third is always YYYY or DD, never a month name
        third = to_number(tokens[2], allow_month_name=False)

        if is_month(first) and 12 < second <= 31:
            #assume MM-DD (second number is too large to be a month)
            day, month = second, first
        elif is_day(first) and is_month(second):
            #assume DD-MM
            day, month = first, second
        elif first > 1900 and is_month(second) and is_day(third):
            #assume YYYY-MM-DD
            day, month = third, second
        elif first > 1900 and is_day(second) and is_month(third):
            #assume YYYY-DD-MM
            day, month = second, third
        else:
            #cannot recover a day/month pair
            cleaned.append(np.nan)
            continue

        cleaned.append("%02d-%02d"%(day,month))

    return pd.Series(cleaned, index=data.index)


#normalise the birthday column to DD-MM and report how many answers survived
birthday_col = df.columns[8]
print(birthday_col)
clean[birthday_col] = clean_date(df[birthday_col])
print("#values: %d"%(clean[birthday_col].count()))
print("#NaN: %d"%(clean[birthday_col].isnull().sum()))

#create figure
fig,ax = plt.subplots(figsize=(8,6))

#drop NaN, split every DD-MM string on '-' and keep the month part only
day_month_pairs = clean[birthday_col].dropna().str.split('-').values.tolist()
data = np.array(day_month_pairs)[:,1]

#plot birth month, one bin per month (edges at 0.5, 1.5, ..., 12.5)
month_edges = np.arange(0.5,13.5,1)
ax.hist(data.astype(int),month_edges,ec='black')
ax.set_title("Distribution of birth month")
ax.set_xlabel("Month")
ax.set_ylabel("Count")

#save histogram
#fig.savefig("dist_month.pdf",bbox_inches='tight')

In [None]:
#clean number of neighbours answers
col = df.columns[9]
data = pd.to_numeric(df[col], errors='coerce')   #vectorised conversion; non-numeric -> NaN
data = data.mask(data.lt(0) | data.gt(8))        #cannot have more than 8 neighbours!
data = data.round()                              #can only have a whole number of neighbours
clean[col] = data

#print number of (non)-NaN values
print(col)
print("#values: %d"%(clean[col].count()))
print("#NaN: %d"%(clean[col].isnull().sum()))

#plot histogram of distribution, one bin per integer 0..8
fig, ax = plt.subplots(figsize=(8,6))
ax.hist(data.dropna().values, np.arange(-0.5,9.5,1), ec='black');
ax.set_title("Distribution of number of neighbours")
ax.set_xlabel("Neighbours")
ax.set_ylabel("Count")

#print number of occurrences
Counter(clean[col].dropna())

In [None]:
#clean column: 'yes' -> 1, 'no' -> 0, 'unknown' -> NaN
#(np.nan, because the np.NaN alias no longer exists in NumPy 2.0)
col = df.columns[10]
clean[col] = df[col].replace({
    'yes':1,
    'no':0,
    'unknown':np.nan
})

#print number of (non)-NaN values
print(col)
print("#values: %d"%(clean[col].count()))
print("#NaN: %d"%(clean[col].isnull().sum()))

#print number of occurrences
Counter(clean[col].dropna())

In [None]:
#clean column
col = df.columns[11]
data = pd.to_numeric(df[col], errors='coerce')   #vectorised conversion; non-numeric -> NaN
data = data.mask(data.lt(0) | data.gt(100))      #can only be between 0 and 100
data = data.round(2)                             #money is rounded to two decimals
clean[col] = data

#print number of (non)-NaN values
print(col)
print("#values: %d"%(clean[col].count()))
print("#NaN: %d"%(clean[col].isnull().sum()))

#plot distribution over the full 0 - 100 range (bins of width 5)
fig,ax = plt.subplots(figsize=(8,6))
ax.hist(data.dropna().values,np.arange(-2.5,107.5,5), ec='black')
ax.set_title("Distribution of money gained")

#plot distribution zoomed in on 0 - 5 (bins of width 0.25)
fig,ax = plt.subplots(figsize=(8,6))
ax.hist(data.dropna().values, np.arange(-0.125,5.375,0.25), ec='black')
ax.set_title("Distribution of money gained")

#show occurrences
Counter(data.dropna())

Cleaning random number column and plotting histogram of numbers 0-10

In [None]:
#https://stackoverflow.com/a/34844867 convert whole column to numeric
#https://stackoverflow.com/a/41618665 mask data to NaN if outside some range

#clean column
col = df.columns[12]
data = df[col].str.replace('ACHT','8')           #answer spelled out as the word 'ACHT'
data = pd.to_numeric(data, errors='coerce')      #vectorised conversion; non-numeric -> NaN
data = data.mask(data.lt(0) | data.gt(10))       #can only be 0 - 10
#data keeps the index of df, so it can be assigned directly
clean[col] = data

#print number of (non)-NaN values
print(col)
print("#values: %d"%(clean[col].count()))
print("#NaN: %d"%(clean[col].isnull().sum()))

#plot distribution and save to file
fig, ax = plt.subplots(figsize=(8,6))
ax.hist(data.dropna().values, np.arange(-0.5,11.5,1), ec='black');
ax.set_title("Random number distribution")
ax.set_xlabel("Value")
ax.set_ylabel("Count")
fig.savefig("dist_random.pdf",bbox_inches='tight')

#print occurrences
Counter(data.dropna())

In [None]:
#https://stackoverflow.com/a/15321222 decoding unicode in string

def bedtime_offset(answer):
    """Return a bedtime as whole hours relative to 00:00, or NaN if unparseable.

    ``answer`` is one normalised answer such as ``'23'`` or ``'23:30'``.
    Hours 8-12 are read as 12-hour evening notation. The rounded hour is
    returned as 0..6 for times after midnight and as a negative number for
    times before midnight (e.g. 23:00 -> -1).
    """
    if not isinstance(answer, str):
        #missing answer
        return np.nan

    #split hour:minute; an answer without ':' is a bare hour
    parts = answer.split(':')
    try:
        hour = int(parts[0])
        minute = int(parts[1]) if len(parts) > 1 else 0
    except ValueError:
        #free-text answer that is not a clock time
        return np.nan

    #assume 12-hour notation if time to bed between 8-12
    if 8 <= hour <= 12:
        hour += 12
    if hour == 24:
        hour = 0

    #sanity check
    if hour > 24 or minute > 59:
        return np.nan

    #offset from 00:00 in hours; anything later than 06:00 counts as "before midnight"
    offset = np.round(hour+minute/60)%24
    if offset > 6:
        offset -= 24
    return offset


col = df.columns[13]
data = df[col]
#NOTE(review): this decode/encode round-trip strips non-ASCII characters; confirm
#that .str.decode behaves as intended on str values with the pandas version in use
data = data.str.decode('unicode_escape').str.encode('ascii','ignore').str.decode('ascii')
data = data.str.lower()

#literal (non-regex) clean-up, applied in this order
replacements = [(' ',''),
                ('a.m.',''),
                ('p.m.',''),
                ('am',''),
                ('pm',''),
                ('.',':'),
                ('300','3:00'),
                ('2330','23:30'),
                ('2359','23:59')]
for old, new in replacements:
    data = data.str.replace(old, new, regex=False)

cleaned = [bedtime_offset(answer) for answer in data.values]
#reuse the index of the source column so rows stay aligned
clean[col] = pd.Series(cleaned, index=data.index)

print(col)
print("#values: %d"%(clean[col].count()))
print("#NaN: %d"%(clean[col].isnull().sum()))

#plot distribution and save to file
fig,ax = plt.subplots(figsize=(8,6))
ax.hist(clean[col].dropna(),np.arange(-4.5,6.5,1),ec='black')
ax.set_title("Time to bed yesterday (hour)")
ax.set_xlabel("Hour relative to 00:00")
ax.set_ylabel("Count")
fig.savefig("dist_timetobed.pdf",bbox_inches='tight')

#occurrences per hour offset, sorted by offset
x = Counter(pd.Series(cleaned).dropna())
sorted(x.items())

# Cleaned data

In [None]:
#preview the first 25 cleaned responses
clean.head(n=25)

# Basic regression/classification

https://towardsdatascience.com/solving-a-simple-classification-problem-with-python-fruits-lovers-edition-d20ab6b071d2

## Predicting programme from having taken other courses

In [None]:
from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn import metrics

In [None]:
#course-history answers used as predictors
feature_names = [
    'Have you taken a course on machine learning?',
    'Have you taken a course on information retrieval?',
    'Have you taken a course on statistics?',
    'Have you taken a course on databases?',
]
classification_name = 'What programme are you in?'

#only take the 5 most frequently occurring programmes
top_programmes = [
    'Artificial Intelligence',
    'Business Analytics',
    'Bioinformatics & Systems Biology',
    'Computational Science',
    'Computer Science',
]
in_top_programmes = clean['What programme are you in?'].isin(top_programmes)
subset = clean.loc[in_top_programmes]

#keep the modelling columns only and drop rows with a missing value
modelling_columns = feature_names+[classification_name]
subset = subset[modelling_columns].dropna()

#split subset into features and target
X = subset[feature_names]
y = subset[classification_name]

In [None]:
from sklearn.linear_model import LogisticRegression

#https://stackoverflow.com/a/39169661
#logistic regression: out-of-fold predictions from 4-fold cross-validation
logreg = LogisticRegression()
predicted = cross_val_predict(logreg, X, y, cv=4)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.tree import DecisionTreeClassifier

#decision tree: out-of-fold predictions from 10-fold cross-validation
clf = DecisionTreeClassifier()
predicted = cross_val_predict(clf, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#k-nearest neighbours (default settings): 10-fold out-of-fold predictions
knn = KNeighborsClassifier()
predicted = cross_val_predict(knn, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#linear discriminant analysis: 10-fold out-of-fold predictions
lda = LinearDiscriminantAnalysis()
predicted = cross_val_predict(lda, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.naive_bayes import GaussianNB

#Gaussian naive Bayes: 10-fold out-of-fold predictions
gnb = GaussianNB()
predicted = cross_val_predict(gnb, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.svm import SVC

#support vector classifier: out-of-fold predictions from 4-fold cross-validation
svm = SVC()
predicted = cross_val_predict(svm, X, y, cv=4)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

## Predict chocolate answer

In [None]:
feature_names = ['What programme are you in?',
                 'Time you went to be Yesterday',
                 'What is your gender?']
classification_name = 'Chocolate makes you.....'

#integer code per programme; its keys also define which programmes are kept
#NOTE(review): integer codes impose an arbitrary order on a nominal variable;
#consider one-hot encoding for the distance-based and linear models
prog_enc = {
    'Artificial Intelligence':0,
    'Business Analytics':1,
    'Bioinformatics & Systems Biology':2,
    'Computational Science':3,
    'Computer Science':4}

#modelling columns only, rows with a missing value dropped
subset = clean[feature_names+[classification_name]].dropna()

#keep the encoded programmes and remove the 'slim' answer, as dataset too small;
#.copy() gives an independent frame so the assignment below raises no
#SettingWithCopyWarning
in_encoded_programme = subset[feature_names[0]].isin(list(prog_enc))
is_slim = subset[classification_name].isin(['slim'])
subset = subset.loc[in_encoded_programme & ~is_slim].copy()

#every remaining programme is a key of prog_enc, so map() yields integer codes
subset[feature_names[0]] = subset[feature_names[0]].map(prog_enc)


X = subset[feature_names]
y = subset[classification_name]

In [None]:
from sklearn.linear_model import LogisticRegression

#https://stackoverflow.com/a/39169661
#logistic regression: out-of-fold predictions from 10-fold cross-validation
logreg = LogisticRegression()
predicted = cross_val_predict(logreg, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.tree import DecisionTreeClassifier

#decision tree: out-of-fold predictions from 10-fold cross-validation
clf = DecisionTreeClassifier()
predicted = cross_val_predict(clf, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#k-nearest neighbours (default settings): 10-fold out-of-fold predictions
knn = KNeighborsClassifier()
predicted = cross_val_predict(knn, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#linear discriminant analysis: 10-fold out-of-fold predictions
lda = LinearDiscriminantAnalysis()
predicted = cross_val_predict(lda, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.naive_bayes import GaussianNB

#Gaussian naive Bayes: 10-fold out-of-fold predictions
gnb = GaussianNB()
predicted = cross_val_predict(gnb, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.svm import SVC

#support vector classifier: out-of-fold predictions from 10-fold cross-validation
svm = SVC()
predicted = cross_val_predict(svm, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

## Predict greediness based on programme and gender

In [None]:
feature_names = ['What programme are you in?',
                 'What is your gender?']
classification_name = clean.columns[11]

#integer code per programme; its keys also define which programmes are kept
#NOTE(review): integer codes impose an arbitrary order on a nominal variable;
#consider one-hot encoding for the distance-based and linear models
prog_enc = {
    'Artificial Intelligence':0,
    'Business Analytics':1,
    'Bioinformatics & Systems Biology':2,
    'Computational Science':3,
    'Computer Science':4}

#modelling columns only, rows with a missing value dropped
subset = clean[feature_names+[classification_name]].dropna()

#binarise the target: <= 1 -> 0 (not greedy), > 1 -> 1 (greedy);
#no NaN is left after dropna(), and the column keeps its original dtype
amount = subset[classification_name]
subset[classification_name] = (amount > 1).astype(amount.dtype)

#keep the encoded programmes only; .copy() gives an independent frame so the
#assignment below raises no SettingWithCopyWarning
subset = subset.loc[subset[feature_names[0]].isin(list(prog_enc))].copy()

#every remaining programme is a key of prog_enc, so map() yields integer codes
subset[feature_names[0]] = subset[feature_names[0]].map(prog_enc)

X = subset[feature_names]
y = subset[classification_name]

In [None]:
from sklearn.linear_model import LogisticRegression

#logistic regression: out-of-fold predictions from 10-fold cross-validation
logreg = LogisticRegression()
predicted = cross_val_predict(logreg, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.tree import DecisionTreeClassifier

#decision tree: out-of-fold predictions from 10-fold cross-validation
clf = DecisionTreeClassifier()
predicted = cross_val_predict(clf, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#k-nearest neighbours (default settings): 10-fold out-of-fold predictions
knn = KNeighborsClassifier()
predicted = cross_val_predict(knn, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#linear discriminant analysis: 10-fold out-of-fold predictions
lda = LinearDiscriminantAnalysis()
predicted = cross_val_predict(lda, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.naive_bayes import GaussianNB

#Gaussian naive Bayes: 10-fold out-of-fold predictions
gnb = GaussianNB()
predicted = cross_val_predict(gnb, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)

In [None]:
from sklearn.svm import SVC

#support vector classifier: out-of-fold predictions from 10-fold cross-validation
svm = SVC()
predicted = cross_val_predict(svm, X, y, cv=10)

#compute the evaluation metrics, then print them
report = metrics.classification_report(y, predicted)
accuracy = metrics.accuracy_score(y, predicted)
kappa = metrics.cohen_kappa_score(y, predicted)
confusion = metrics.confusion_matrix(y, predicted)

print(report)
print("Accuracy score: %.2f" % accuracy)
print("Cohen Kappa score: %.2f" % kappa)
print("Confusion matrix:")
print(confusion)