In [1]:

import pandas

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

from sklearn.cluster import KMeans

from sklearn.metrics.pairwise import pairwise_distances_argmin

from sklearn.metrics import accuracy_score

In [2]:

#Task 1
#( a )

dataset = pandas.read_csv( "income.csv" )

dataset.head()

Unnamed: 0,income,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week
0,0,39,State-gov,Bachelors,NotMarried,Adm-clerical,Not-in-family,White,Male,40
1,0,50,Self-emp-not-inc,Bachelors,Married,Exec-managerial,Husband,White,Male,13
2,0,38,Private,HS-grad,Separated,Handlers-cleaners,Not-in-family,White,Male,40
3,0,53,Private,11th,Married,Handlers-cleaners,Husband,Black,Male,40
4,0,28,Private,Bachelors,Married,Prof-specialty,Wife,Black,Female,40


In [3]:

dataset.isna().sum()

income               0
age                  0
workclass         1396
education            0
marital-status       0
occupation        1401
relationship         0
race                 0
sex                  0
hours-per-week       0
dtype: int64

In [4]:

dataset.dtypes

income             int64
age                int64
workclass         object
education         object
marital-status    object
occupation        object
relationship      object
race              object
sex               object
hours-per-week     int64
dtype: object

In [5]:

dataset.duplicated().any()

True

In [6]:

dataset.shape

(26215, 10)

In [7]:

print("dataset length:", len(dataset))

dataset length: 26215


In [8]:

#( b )

dataset = dataset.dropna()
#print("dataset length:", len(dataset))

dataset.shape

(24814, 10)

In [9]:

#( c )

dataset = dataset.drop_duplicates()

dataset.shape

(21537, 10)

In [10]:

#( d )

dataset['sex'] = dataset['sex'].replace({'Male': 0, 'Female': 1})

dataset.head()

  dataset['sex'] = dataset['sex'].replace({'Male': 0, 'Female': 1})


Unnamed: 0,income,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week
0,0,39,State-gov,Bachelors,NotMarried,Adm-clerical,Not-in-family,White,0,40
1,0,50,Self-emp-not-inc,Bachelors,Married,Exec-managerial,Husband,White,0,13
2,0,38,Private,HS-grad,Separated,Handlers-cleaners,Not-in-family,White,0,40
3,0,53,Private,11th,Married,Handlers-cleaners,Husband,Black,0,40
4,0,28,Private,Bachelors,Married,Prof-specialty,Wife,Black,1,40


In [11]:

dataset['education'] = dataset['education'].replace({'Preschool' : 0, \
                                                     '1st-4th': 1, \
                                                     '5th-6th': 2, \
                                                     '7th-8th': 3, \
                                                     '9th': 4, \
                                                     '10th': 5, \
                                                     '11th': 6, \
                                                     '12th': 7, \
                                                     'HS-grad': 8, \
                                                     'Some-college': 9, \
                                                     'Assoc-voc' : 10, \
                                                     'Assoc-acdm' : 11, \
                                                     'Assoc-acdm' : 12, \
                                                     'Bachelors' : 13, \
                                                     'Masters' : 14, \
                                                     'Prof-school' : 15, \
                                                     'Doctorate' : 16})

dataset.head()

  dataset['education'] = dataset['education'].replace({'Preschool' : 0, \


Unnamed: 0,income,age,workclass,education,marital-status,occupation,relationship,race,sex,hours-per-week
0,0,39,State-gov,13,NotMarried,Adm-clerical,Not-in-family,White,0,40
1,0,50,Self-emp-not-inc,13,Married,Exec-managerial,Husband,White,0,13
2,0,38,Private,8,Separated,Handlers-cleaners,Not-in-family,White,0,40
3,0,53,Private,6,Married,Handlers-cleaners,Husband,Black,0,40
4,0,28,Private,13,Married,Prof-specialty,Wife,Black,1,40


In [12]:

dataset = pandas.get_dummies( dataset, columns = [ 'workclass', 'marital-status', 'occupation', \
                                                          'relationship', 'race'], drop_first=True )

In [13]:

dataset.head()

dataset.shape

(21537, 36)

In [14]:

#( e )

converted_numpy_array = dataset.values
input_variables = converted_numpy_array[:, 1:]
target_variable = converted_numpy_array[:, 0]

In [15]:

X_train, X_test, y_train, y_test = train_test_split( input_variables, target_variable, test_size=0.1, random_state=1 )

In [16]:

#( f )

normalised_dataset = MinMaxScaler().fit( X_train )

X_train_normalised = normalised_dataset.transform( X_train )

X_test_normalised = normalised_dataset.transform( X_test )

In [17]:

#Task 2
#( a )

y_train = y_train.astype(int)

y_test = y_test.astype(int)

logistic_regression_model = LogisticRegression()

logistic_regression_model.fit( X_train_normalised, y_train)

In [18]:

support_vector_machine_model = SVC()
support_vector_machine_model.fit( X_train_normalised, y_train )

In [19]:

#( b )

kfold = KFold(n_splits=10, shuffle=True, random_state=2)

results = cross_val_score( logistic_regression_model, X_train_normalised, y_train, cv=kfold )

print("Average Accuracy of Logistic regression model is",results.mean())

results = cross_val_score( support_vector_machine_model, X_train_normalised, y_train, cv=kfold )

print("Average Accuracy of Support vector machine model is",results.mean())

Average Accuracy of Logistic regression model is 0.8073043619880025
Average Accuracy of Support vector machine model is 0.8042087060931156


In [20]:

#( c )

grid_params_lr = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10],
    'solver': ['saga', 'liblinear']
}

lr = LogisticRegression(max_iter=150)
gs_lr_result = GridSearchCV(lr, grid_params_lr, cv=kfold).fit(X_train_normalised, y_train)
print(gs_lr_result.best_score_)



0.8074589744695141


In [21]:

grid_params_svc = {
    'kernel': ['linear', 'poly'],
    'C': [1, 10],
    'degree': [3, 8],
    'gamma': ['auto','scale']
}

svc = SVC()
gs_svc_result = GridSearchCV(svc, grid_params_svc, cv=kfold).fit(X_train_normalised, y_train)
print(gs_svc_result.best_score_)

0.8079752364559732


In [22]:

#( d )

test_accuracy = gs_lr_result.best_estimator_.score(X_test_normalised, y_test)
print("Accuracy in testing:", test_accuracy)

gs_lr_result.best_params_

Accuracy in testing: 0.7887650882079852


{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

In [23]:

test_accuracy = gs_svc_result.best_estimator_.score(X_test_normalised, y_test)
print("Accuracy in testing:", test_accuracy)

gs_lr_result.best_params_

Accuracy in testing: 0.7859795728876509


{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

In [24]:

#( 3 )
#( a )

kmeans = KMeans(n_clusters=2, random_state=0).fit( X_train_normalised )

In [25]:

#( b )

cluster_labels = kmeans.labels_

cluster_counts = np.bincount( cluster_labels )

count = 1
for index in cluster_counts:
    print( f'In cluster {count} there are {index} samples' )
    count = count + 1

In cluster 1 there are 5781 samples
In cluster 2 there are 13602 samples


In [26]:

#( c )

kmeans_cluster_centers = kmeans.cluster_centers_
closest = pairwise_distances_argmin(kmeans.cluster_centers_, X_train_normalised )

dataset.iloc[closest, :]

Unnamed: 0,income,age,education,sex,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,occupation_Transport-moving,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White
3378,0,43,9,0,40,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
16016,0,69,2,0,40,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [33]:

#( d )

cluster_labels_for_testing = kmeans.predict( X_test_normalised )

accuracy = accuracy_score( y_test, cluster_labels_for_testing )
print("k means prediction accuracy:", accuracy)

k means prediction accuracy: 0.3867223769730734
