In [50]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import preprocessing, metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
sns.set(style="white", color_codes=True)
import warnings
warnings.filterwarnings("ignore")


In [51]:
df= pd.read_csv(r"CC GENERAL.csv")
df.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


In [52]:
df['TENURE'].value_counts()
df.fillna(df.mean(), inplace=True)#replacing the null/Empty values with the mean

In [53]:
X = df.drop('TENURE',axis=1).values
y = df['TENURE'].values

In [54]:
#Getting the shape of X and Y
x= df.iloc[:,1:-1]
y = df.iloc[:,-1]
print(x.shape,y.shape)

(8950, 16) (8950,)


In [55]:
#a. Apply PCA on CC dataset.
pca = PCA(3)
x_pca = pca.fit_transform(x)
principalDf = pd.DataFrame(data = x_pca, columns = ['principal component 1', 'principal component 2', 'principal component 3'])
finalDf = pd.concat([principalDf, df.iloc[:,-1]], axis = 1)
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,TENURE
0,-4326.383979,921.566882,183.708383,12
1,4118.916665,-2432.846346,2369.969289,12
2,1497.907641,-1997.578694,-2125.631328,12
3,1394.548536,-1488.743453,-2431.799649,12
4,-3743.351896,757.342657,512.476492,12


In [56]:
#Apply k-means algorithm on the PCA result and report your observation if the silhouette score has
#improved or not?
X = finalDf.iloc[:,0:-1]
y = finalDf.iloc[:,-1]

nclusters = 3 # this is the k in kmeans
km = KMeans(n_clusters=nclusters)
km.fit(X)


y_cluster_kmeans = km.predict(X)

print(classification_report(y, y_cluster_kmeans, zero_division=1))
print(confusion_matrix(y, y_cluster_kmeans))

#finding the accuracy
train_accuracy = accuracy_score(y, y_cluster_kmeans)
print("\nAccuracy for our Training dataset with PCA:", train_accuracy)

#Calculating silhouette Score
Silscore = metrics.silhouette_score(X, y_cluster_kmeans)
print("Sihouette Score: ",Silscore) 

              precision    recall  f1-score   support

           0       0.00      1.00      0.00       0.0
           1       0.00      1.00      0.00       0.0
           2       0.00      1.00      0.00       0.0
           6       1.00      0.00      0.00     204.0
           7       1.00      0.00      0.00     190.0
           8       1.00      0.00      0.00     196.0
           9       1.00      0.00      0.00     175.0
          10       1.00      0.00      0.00     236.0
          11       1.00      0.00      0.00     365.0
          12       1.00      0.00      0.00    7584.0

    accuracy                           0.00    8950.0
   macro avg       0.70      0.30      0.00    8950.0
weighted avg       1.00      0.00      0.00    8950.0

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [ 175   28    1    0    0    0    0    0    0    0]
 [ 173   15    2    0    0   

In [57]:
#Perform Scaling+PCA+K-Means and report performance.
x = df.iloc[:,1:-1]
y = df.iloc[:,-1]
print(x.shape,y.shape)

(8950, 16) (8950,)


In [58]:
# Scale the dataset
scaler = StandardScaler()
scaler.fit(x)
X_scaled_array = scaler.transform(x)

In [59]:
# Instantiate PCA
pca = PCA(3)
x_pca = pca.fit_transform(X_scaled_array)
principalDf = pd.DataFrame(data = x_pca, columns = ['principal component 1', 'principal component 2','principal component 3'])
finalDf = pd.concat([principalDf, df.iloc[:,-1]], axis = 1)
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3,TENURE
0,-1.718893,-1.072939,0.53575,12
1,-1.169306,2.509314,0.627247,12
2,0.938414,-0.382597,0.161506,12
3,-0.907503,0.045855,1.521143,12
4,-1.63783,-0.684972,0.426046,12


In [60]:
# this is the k in kmeans
x = finalDf.iloc[:,0:-1]
y = finalDf["TENURE"]
print(X.shape,y.shape)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.34,random_state=0)
nclusters = 3 
km = KMeans(n_clusters=nclusters)
km.fit(X_train,y_train)


(8950, 3) (8950,)


In [61]:
# predict the cluster for each training data point
y_clus_train = km.predict(X_train)

# Summary of the predictions made by the classifier
print(classification_report(y_train, y_clus_train, zero_division=1))
print(confusion_matrix(y_train, y_clus_train))

train_accuracy = accuracy_score(y_train, y_clus_train)
print("Accuracy for our Training dataset with PCA:", train_accuracy)

#Calculating sihouette Score
score = metrics.silhouette_score(X_train, y_clus_train)
print("Sihouette Score: ",score)   #ranges from -1 to +1, high value shows that it is matched more

              precision    recall  f1-score   support

           0       0.00      1.00      0.00       0.0
           1       0.00      1.00      0.00       0.0
           2       0.00      1.00      0.00       0.0
           6       1.00      0.00      0.00     139.0
           7       1.00      0.00      0.00     135.0
           8       1.00      0.00      0.00     128.0
           9       1.00      0.00      0.00     118.0
          10       1.00      0.00      0.00     151.0
          11       1.00      0.00      0.00     262.0
          12       1.00      0.00      0.00    4974.0

    accuracy                           0.00    5907.0
   macro avg       0.70      0.30      0.00    5907.0
weighted avg       1.00      0.00      0.00    5907.0

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   4   30  105    0    0    0    0    0    0    0]
 [   1   26  108    0    0   

In [62]:
# predict the cluster for each testing data point
y_clus_test = km.predict(X_test)

# Summary of the predictions made by the classifier
print(classification_report(y_test, y_clus_test, zero_division=1))
print(confusion_matrix(y_test, y_clus_test))

train_accuracy = accuracy_score(y_test, y_clus_test)
print("\nAccuracy for our Training dataset with PCA:", train_accuracy)

#Calculating sihouette Score
score = metrics.silhouette_score(X_test, y_clus_test)
print("Sihouette Score: ",score)   #ranges from -1 to +1, high value shows that it is matched more

              precision    recall  f1-score   support

           0       0.00      1.00      0.00       0.0
           1       0.00      1.00      0.00       0.0
           2       0.00      1.00      0.00       0.0
           6       1.00      0.00      0.00      65.0
           7       1.00      0.00      0.00      55.0
           8       1.00      0.00      0.00      68.0
           9       1.00      0.00      0.00      57.0
          10       1.00      0.00      0.00      85.0
          11       1.00      0.00      0.00     103.0
          12       1.00      0.00      0.00    2610.0

    accuracy                           0.00    3043.0
   macro avg       0.70      0.30      0.00    3043.0
weighted avg       1.00      0.00      0.00    3043.0

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   3   21   41    0    0    0    0    0    0    0]
 [   0   12   43    0    0   

In [63]:
#2. Use pd_speech_features.csv
#a. Perform Scaling
#b. Apply PCA (k=3)
#c. Use SVM to report performance

In [64]:
df_pd = pd.read_csv(r"pd_speech_features.csv")

In [65]:
df_pd.isnull().any()
X = df_pd.drop('class',axis=1).values
Y = df_pd['class'].values

#Perform Scaling
scaler = StandardScaler()
X_Scale = scaler.fit_transform(X)


In [66]:
#Apply PCA (k=3)
pca3 = PCA(n_components=3)
principalComponents = pca3.fit_transform(X_Scale)

principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2','Principal Component 3'])

finalDf = pd.concat([principalDf, df_pd[['class']]], axis = 1)
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,Principal Component 3,class
0,-10.047372,1.471079,-6.846406,1
1,-10.637725,1.58375,-6.830978,1
2,-13.516185,-1.25354,-6.818699,1
3,-9.155084,8.833603,15.290899,1
4,-6.76447,4.61147,15.637118,1


In [67]:
X = finalDf.drop('class',axis=1).values
Y = finalDf['class'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=0)

In [69]:
#Use SVM to report performance
from sklearn.svm import SVC

svmClassifier = SVC()
svmClassifier.fit(X_train, Y_train)

y_pred = svmClassifier.predict(X_test)

# Summary of the predictions made by the classifier
print(classification_report(Y_test, y_pred, zero_division=1))
print(confusion_matrix(Y_test, y_pred))
# Accuracy score
glass_acc_svc = accuracy_score(y_pred,Y_test)
print('accuracy is',glass_acc_svc)

#Calculate sihouette Score
score = metrics.silhouette_score(X_test, y_pred)
print("Sihouette Score: ",score) 

              precision    recall  f1-score   support

           0       0.67      0.42      0.52        57
           1       0.83      0.93      0.88       170

    accuracy                           0.80       227
   macro avg       0.75      0.68      0.70       227
weighted avg       0.79      0.80      0.79       227

[[ 24  33]
 [ 12 158]]
accuracy is 0.801762114537445
Sihouette Score:  0.25432088725849344


In [70]:
#3. Apply Linear Discriminant Analysis (LDA) on Iris.csv dataset to reduce dimensionality of data tok=2.

In [71]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
df_iris = pd.read_csv(r"Iris.csv")

df_iris.isnull().any()

x = df_iris.iloc[:,1:-1]
y = df_iris.iloc[:,-1]
print(x.shape,y.shape)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
le = LabelEncoder()
y = le.fit_transform(y)



(150, 4) (150,)


In [72]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)
print(X_train.shape,X_test.shape)

(105, 2) (45, 2)


In [None]:
#Briefly identify the difference between PCA and LDA
PCA(Principal Component Analysis):
  2.PCA is an unsupervised algorithm that does not care about classes and labels and only aims to find the principal 
  components to maximize the variance in the given dataset.
  3.PCA is assumed to be an as good performer for a comparatively small sample size.
LDA(Linear Discriminant Analysis):
  1.LDA is a supervised algorithm that aims to find the linear discriminants to represent the axes that maximize 
  separation between different classes of data.
  2.suitable for multi-class classification tasks.