<a href="https://colab.research.google.com/github/Jming9638/MachineLearning/blob/main/PythonClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Data Loader

In [None]:
from google.colab import auth
import gspread
from google.auth import default

# Authenticate the Colab session and build a gspread client from the default credentials.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Read the 'gender' worksheet: the first row supplies the column names,
# the remaining rows are the records. Exact duplicate rows are dropped.
sh = gc.open('Race and Gender').worksheet('gender')
rows = sh.get_all_values()
header, records = rows[0], rows[1:]
gender = pd.DataFrame.from_records(data=records, columns=header).drop_duplicates()

# Class counts after de-duplication (displayed as the cell output).
gender['Gender'].value_counts()

Female    8995
Male      4369
Name: Gender, dtype: int64

In [None]:
# Re-create the gspread client from the credentials obtained earlier in the notebook.
gc = gspread.authorize(creds)

# Read the 'race' worksheet: the first row supplies the column names,
# the remaining rows are the records. Exact duplicate rows are dropped.
sh = gc.open('Race and Gender').worksheet('race')
rows = sh.get_all_values()
header, records = rows[0], rows[1:]
race = pd.DataFrame.from_records(data=records, columns=header).drop_duplicates()

# Class counts after de-duplication (displayed as the cell output).
race['Race'].value_counts()

Chinese    6072
Malay      3119
Others     3114
Indian     2236
Name: Race, dtype: int64

# Gender

## CountVectorizer

In [None]:
# Build the gender train/test matrices from character n-gram counts of the names.
#
# Order of operations is balance -> split -> vectorise. The vectoriser is fitted
# on the training names only and merely applied to the test names, so its
# vocabulary is never learned from held-out data (pre-processing data leakage).
names = gender[['Name']]  # kept 2-D because the sampler expects a 2-D X
y = gender['Gender']

# Undersample the majority class on the raw names so both classes are equally sized.
random_sampling = RandomUnderSampler(random_state=42)
names_resampled, y_resampled = random_sampling.fit_resample(names, y)

# 70/30 hold-out split of the balanced names.
names_train, names_test, y_train, y_test = train_test_split(
    names_resampled['Name'], y_resampled, test_size=0.3, random_state=42
)

# Character 2- to 5-grams; fit on train, transform test with the same vocabulary.
vec = CountVectorizer(analyzer='char', ngram_range=(2, 5), lowercase=True)
X_train = vec.fit_transform(names_train)
X_test = vec.transform(names_test)

### LogisticRegression

In [None]:
# Logistic regression on the current train split, scored on the hold-out split.
model = LogisticRegression(max_iter=2000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 84.1%
[[1100  213]
 [ 204 1105]]
              precision    recall  f1-score   support

      Female       0.84      0.84      0.84      1313
        Male       0.84      0.84      0.84      1309

    accuracy                           0.84      2622
   macro avg       0.84      0.84      0.84      2622
weighted avg       0.84      0.84      0.84      2622



In [None]:
# Optional export (disabled): when uncommented, this pickles a 4-tuple of
# (model, algorithm label, vectoriser, hold-out accuracy) to gender_prediction.pkl.
# It reads whatever `model`, `vec`, `y_test` and `y_pred` are currently in the
# kernel, so it should be run right after the cell that fits the model to save.
# algorithm = 'Logistic Regression'
# accuracy = accuracy_score(y_test, y_pred)

# with open("gender_prediction.pkl", "wb") as file:
#     pickle.dump((model, algorithm, vec, accuracy), file)

### DecisionTreeClassifier

In [None]:
# Decision tree on the current train split, scored on the hold-out split.
# random_state is pinned (same seed as the sampler and the split) so the
# fitted tree and the reported metrics are reproducible on every re-run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
print('Model accuracy: {score}%'.format(score=round(accuracy_score(y_test, y_pred) * 100, 2)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model accuracy: 75.82%
[[ 965  348]
 [ 286 1023]]
              precision    recall  f1-score   support

      Female       0.77      0.73      0.75      1313
        Male       0.75      0.78      0.76      1309

    accuracy                           0.76      2622
   macro avg       0.76      0.76      0.76      2622
weighted avg       0.76      0.76      0.76      2622



### RandomForestClassifier

In [None]:
# Random forest on the current train split, scored on the hold-out split.
# random_state is pinned (same seed as the sampler and the split) so the
# bootstrap samples / feature draws, and therefore the metrics, are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
print('Model accuracy: {score}%'.format(score=round(accuracy_score(y_test, y_pred) * 100, 2)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model accuracy: 83.1%
[[1084  229]
 [ 214 1095]]
              precision    recall  f1-score   support

      Female       0.84      0.83      0.83      1313
        Male       0.83      0.84      0.83      1309

    accuracy                           0.83      2622
   macro avg       0.83      0.83      0.83      2622
weighted avg       0.83      0.83      0.83      2622



### NaiveBayes

In [None]:
# Gaussian naive Bayes on the current split. The feature matrices are converted
# with .toarray() at the call site, so the dense copies are not kept around.
model = GaussianNB().fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 76.54%
[[1052  261]
 [ 354  955]]
              precision    recall  f1-score   support

      Female       0.75      0.80      0.77      1313
        Male       0.79      0.73      0.76      1309

    accuracy                           0.77      2622
   macro avg       0.77      0.77      0.77      2622
weighted avg       0.77      0.77      0.77      2622



### KNN

In [None]:
# 5-nearest-neighbours classifier on the current train split, scored on the hold-out split.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 64.84%
[[947 366]
 [556 753]]
              precision    recall  f1-score   support

      Female       0.63      0.72      0.67      1313
        Male       0.67      0.58      0.62      1309

    accuracy                           0.65      2622
   macro avg       0.65      0.65      0.65      2622
weighted avg       0.65      0.65      0.65      2622



## TfidfVectorizer

In [None]:
# Build the gender train/test matrices from TF-IDF weighted character n-grams.
#
# Order of operations is balance -> split -> vectorise. The vectoriser is fitted
# on the training names only and merely applied to the test names, so neither its
# vocabulary nor its IDF weights are learned from held-out data (data leakage).
names = gender[['Name']]  # kept 2-D because the sampler expects a 2-D X
y = gender['Gender']

# Undersample the majority class on the raw names so both classes are equally sized.
random_sampling = RandomUnderSampler(random_state=42)
names_resampled, y_resampled = random_sampling.fit_resample(names, y)

# 70/30 hold-out split of the balanced names.
names_train, names_test, y_train, y_test = train_test_split(
    names_resampled['Name'], y_resampled, test_size=0.3, random_state=42
)

# Character 2- to 5-grams; fit on train, transform test with the same vocabulary/IDF.
vec = TfidfVectorizer(analyzer='char', ngram_range=(2, 5), lowercase=True)
X_train = vec.fit_transform(names_train)
X_test = vec.transform(names_test)

### LogisticRegression

In [None]:
# Logistic regression on the current train split, scored on the hold-out split.
model = LogisticRegression(max_iter=2000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 84.06%
[[1080  233]
 [ 185 1124]]
              precision    recall  f1-score   support

      Female       0.85      0.82      0.84      1313
        Male       0.83      0.86      0.84      1309

    accuracy                           0.84      2622
   macro avg       0.84      0.84      0.84      2622
weighted avg       0.84      0.84      0.84      2622



### DecisionTreeClassifier

In [None]:
# Decision tree on the current train split, scored on the hold-out split.
# random_state is pinned (same seed as the sampler and the split) so the
# fitted tree and the reported metrics are reproducible on every re-run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
print('Model accuracy: {score}%'.format(score=round(accuracy_score(y_test, y_pred) * 100, 2)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model accuracy: 73.34%
[[956 357]
 [342 967]]
              precision    recall  f1-score   support

      Female       0.74      0.73      0.73      1313
        Male       0.73      0.74      0.73      1309

    accuracy                           0.73      2622
   macro avg       0.73      0.73      0.73      2622
weighted avg       0.73      0.73      0.73      2622



### RandomForestClassifier

In [None]:
# Random forest on the current train split, scored on the hold-out split.
# random_state is pinned (same seed as the sampler and the split) so the
# bootstrap samples / feature draws, and therefore the metrics, are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
print('Model accuracy: {score}%'.format(score=round(accuracy_score(y_test, y_pred) * 100, 2)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model accuracy: 83.1%
[[1050  263]
 [ 180 1129]]
              precision    recall  f1-score   support

      Female       0.85      0.80      0.83      1313
        Male       0.81      0.86      0.84      1309

    accuracy                           0.83      2622
   macro avg       0.83      0.83      0.83      2622
weighted avg       0.83      0.83      0.83      2622



### NaiveBayes

In [None]:
# Gaussian naive Bayes on the current split. The feature matrices are converted
# with .toarray() at the call site, so the dense copies are not kept around.
model = GaussianNB().fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 75.1%
[[993 320]
 [333 976]]
              precision    recall  f1-score   support

      Female       0.75      0.76      0.75      1313
        Male       0.75      0.75      0.75      1309

    accuracy                           0.75      2622
   macro avg       0.75      0.75      0.75      2622
weighted avg       0.75      0.75      0.75      2622



### KNN

In [None]:
# 5-nearest-neighbours classifier on the current train split, scored on the hold-out split.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 77.23%
[[ 994  319]
 [ 278 1031]]
              precision    recall  f1-score   support

      Female       0.78      0.76      0.77      1313
        Male       0.76      0.79      0.78      1309

    accuracy                           0.77      2622
   macro avg       0.77      0.77      0.77      2622
weighted avg       0.77      0.77      0.77      2622



# Race

## CountVectorizer

In [None]:
# Build the race train/test matrices from character n-gram counts of the names.
#
# Order of operations is balance -> split -> vectorise. The vectoriser is fitted
# on the training names only and merely applied to the test names, so its
# vocabulary is never learned from held-out data (pre-processing data leakage).
names = race[['Name']]  # kept 2-D because the sampler expects a 2-D X
y = race['Race']

# Undersample the larger classes on the raw names so all classes are equally sized.
random_sampling = RandomUnderSampler(random_state=42)
names_resampled, y_resampled = random_sampling.fit_resample(names, y)

# 70/30 hold-out split of the balanced names.
names_train, names_test, y_train, y_test = train_test_split(
    names_resampled['Name'], y_resampled, test_size=0.3, random_state=42
)

# Character 2- to 5-grams; fit on train, transform test with the same vocabulary.
vec = CountVectorizer(analyzer='char', ngram_range=(2, 5), lowercase=True)
X_train = vec.fit_transform(names_train)
X_test = vec.transform(names_test)

### LogisticRegression

In [None]:
# Logistic regression on the current train split, scored on the hold-out split.
model = LogisticRegression(max_iter=2000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 78.28%
[[655   8  13  11]
 [ 14 497  35 108]
 [ 10  33 524  95]
 [ 33 127  96 425]]
              precision    recall  f1-score   support

     Chinese       0.92      0.95      0.94       687
      Indian       0.75      0.76      0.75       654
       Malay       0.78      0.79      0.79       662
      Others       0.67      0.62      0.64       681

    accuracy                           0.78      2684
   macro avg       0.78      0.78      0.78      2684
weighted avg       0.78      0.78      0.78      2684



### DecisionTreeClassifier

In [None]:
# Decision tree on the current train split, scored on the hold-out split.
# random_state is pinned (same seed as the sampler and the split) so the
# fitted tree and the reported metrics are reproducible on every re-run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
print('Model accuracy: {score}%'.format(score=round(accuracy_score(y_test, y_pred) * 100, 2)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model accuracy: 67.66%
[[599  24  24  40]
 [ 26 411  54 163]
 [ 22  57 482 101]
 [ 77 155 125 324]]
              precision    recall  f1-score   support

     Chinese       0.83      0.87      0.85       687
      Indian       0.64      0.63      0.63       654
       Malay       0.70      0.73      0.72       662
      Others       0.52      0.48      0.50       681

    accuracy                           0.68      2684
   macro avg       0.67      0.68      0.67      2684
weighted avg       0.67      0.68      0.67      2684



### RandomForestClassifier

In [None]:
# Random forest on the current train split, scored on the hold-out split.
# random_state is pinned (same seed as the sampler and the split) so the
# bootstrap samples / feature draws, and therefore the metrics, are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
print('Model accuracy: {score}%'.format(score=round(accuracy_score(y_test, y_pred) * 100, 2)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model accuracy: 76.45%
[[657   8  11  11]
 [ 21 508  37  88]
 [ 22  32 555  53]
 [ 68 179 102 332]]
              precision    recall  f1-score   support

     Chinese       0.86      0.96      0.90       687
      Indian       0.70      0.78      0.74       654
       Malay       0.79      0.84      0.81       662
      Others       0.69      0.49      0.57       681

    accuracy                           0.76      2684
   macro avg       0.76      0.76      0.76      2684
weighted avg       0.76      0.76      0.76      2684



### NaiveBayes

In [None]:
# Gaussian naive Bayes on the current split. The feature matrices are converted
# with .toarray() at the call site, so the dense copies are not kept around.
model = GaussianNB().fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 76.94%
[[635  16  13  23]
 [  7 494  51 102]
 [ 10  25 532  95]
 [ 27 149 101 404]]
              precision    recall  f1-score   support

     Chinese       0.94      0.92      0.93       687
      Indian       0.72      0.76      0.74       654
       Malay       0.76      0.80      0.78       662
      Others       0.65      0.59      0.62       681

    accuracy                           0.77      2684
   macro avg       0.77      0.77      0.77      2684
weighted avg       0.77      0.77      0.77      2684



### KNN

In [None]:
# 5-nearest-neighbours classifier on the current train split, scored on the hold-out split.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 56.74%
[[649   8  11  19]
 [200 321  57  76]
 [144  58 399  61]
 [237 167 123 154]]
              precision    recall  f1-score   support

     Chinese       0.53      0.94      0.68       687
      Indian       0.58      0.49      0.53       654
       Malay       0.68      0.60      0.64       662
      Others       0.50      0.23      0.31       681

    accuracy                           0.57      2684
   macro avg       0.57      0.57      0.54      2684
weighted avg       0.57      0.57      0.54      2684



## TfidfVectorizer

In [None]:
# Build the race train/test matrices from TF-IDF weighted character n-grams.
#
# Order of operations is balance -> split -> vectorise. The vectoriser is fitted
# on the training names only and merely applied to the test names, so neither its
# vocabulary nor its IDF weights are learned from held-out data (data leakage).
names = race[['Name']]  # kept 2-D because the sampler expects a 2-D X
y = race['Race']

# Undersample the larger classes on the raw names so all classes are equally sized.
random_sampling = RandomUnderSampler(random_state=42)
names_resampled, y_resampled = random_sampling.fit_resample(names, y)

# 70/30 hold-out split of the balanced names.
names_train, names_test, y_train, y_test = train_test_split(
    names_resampled['Name'], y_resampled, test_size=0.3, random_state=42
)

# Character 2- to 5-grams; fit on train, transform test with the same vocabulary/IDF.
vec = TfidfVectorizer(analyzer='char', ngram_range=(2, 5), lowercase=True)
X_train = vec.fit_transform(names_train)
X_test = vec.transform(names_test)

### LogisticRegression

In [None]:
# Logistic regression on the current train split, scored on the hold-out split.
model = LogisticRegression(max_iter=2000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 79.36%
[[648   7  11  21]
 [  8 491  30 125]
 [ 11  24 533  94]
 [ 32 111  80 458]]
              precision    recall  f1-score   support

     Chinese       0.93      0.94      0.94       687
      Indian       0.78      0.75      0.76       654
       Malay       0.81      0.81      0.81       662
      Others       0.66      0.67      0.66       681

    accuracy                           0.79      2684
   macro avg       0.79      0.79      0.79      2684
weighted avg       0.79      0.79      0.79      2684



### DecisionTreeClassifier

In [None]:
# Decision tree on the current train split, scored on the hold-out split.
# random_state is pinned (same seed as the sampler and the split) so the
# fitted tree and the reported metrics are reproducible on every re-run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
print('Model accuracy: {score}%'.format(score=round(accuracy_score(y_test, y_pred) * 100, 2)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model accuracy: 66.92%
[[583  28  30  46]
 [ 25 429  56 144]
 [ 28  47 471 116]
 [ 62 168 138 313]]
              precision    recall  f1-score   support

     Chinese       0.84      0.85      0.84       687
      Indian       0.64      0.66      0.65       654
       Malay       0.68      0.71      0.69       662
      Others       0.51      0.46      0.48       681

    accuracy                           0.67      2684
   macro avg       0.66      0.67      0.67      2684
weighted avg       0.66      0.67      0.67      2684



### RandomForestClassifier

In [None]:
# Random forest on the current train split, scored on the hold-out split.
# random_state is pinned (same seed as the sampler and the split) so the
# bootstrap samples / feature draws, and therefore the metrics, are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
print('Model accuracy: {score}%'.format(score=round(accuracy_score(y_test, y_pred) * 100, 2)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model accuracy: 78.58%
[[656   9   6  16]
 [ 15 481  31 127]
 [ 21  33 541  67]
 [ 42 120  88 431]]
              precision    recall  f1-score   support

     Chinese       0.89      0.95      0.92       687
      Indian       0.75      0.74      0.74       654
       Malay       0.81      0.82      0.81       662
      Others       0.67      0.63      0.65       681

    accuracy                           0.79      2684
   macro avg       0.78      0.79      0.78      2684
weighted avg       0.78      0.79      0.78      2684



In [None]:
# Optional export (disabled): when uncommented, this pickles a 4-tuple of
# (model, algorithm label, vectoriser, hold-out accuracy) to race_prediction.pkl.
# It reads whatever `model`, `vec`, `y_test` and `y_pred` are currently in the
# kernel, so it should be run right after the cell that fits the model to save.
# algorithm = 'Random Forest Classifier'
# accuracy = accuracy_score(y_test, y_pred)

# with open("race_prediction.pkl", "wb") as file:
#     pickle.dump((model, algorithm, vec, accuracy), file)

### NaiveBayes

In [None]:
# Gaussian naive Bayes on the current split. The feature matrices are converted
# with .toarray() at the call site, so the dense copies are not kept around.
model = GaussianNB().fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 75.78%
[[616  19  14  38]
 [  8 481  51 114]
 [  9  30 516 107]
 [ 26 139  95 421]]
              precision    recall  f1-score   support

     Chinese       0.93      0.90      0.92       687
      Indian       0.72      0.74      0.73       654
       Malay       0.76      0.78      0.77       662
      Others       0.62      0.62      0.62       681

    accuracy                           0.76      2684
   macro avg       0.76      0.76      0.76      2684
weighted avg       0.76      0.76      0.76      2684



### KNN

In [None]:
# 5-nearest-neighbours classifier on the current train split, scored on the hold-out split.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy as a percentage rounded to 2 d.p., then the confusion matrix
# and the per-class precision/recall/F1 report.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Model accuracy: {score}%'.format(score=accuracy_pct))
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Model accuracy: 75.48%
[[655  14  10   8]
 [ 32 515  48  59]
 [ 23  41 568  30]
 [ 85 155 153 288]]
              precision    recall  f1-score   support

     Chinese       0.82      0.95      0.88       687
      Indian       0.71      0.79      0.75       654
       Malay       0.73      0.86      0.79       662
      Others       0.75      0.42      0.54       681

    accuracy                           0.75      2684
   macro avg       0.75      0.76      0.74      2684
weighted avg       0.75      0.75      0.74      2684

