## Initialize

In [None]:
import numpy as np
import pandas as pd
import re # regular expression

#前處理
from collections import Counter
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.preprocessing import LabelEncoder
# from random import randrange, uniform
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.datasets import make_classification

# 分類器
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.metrics import accuracy_score ,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import matplotlib.pyplot as plt

# Mount Google Drive into the Colab filesystem at /content/drive; the CSV
# paths used further down in this notebook point inside that mount.
# force_remount=True asks for a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


## Functions


### a. Data Preprocessing

In [None]:
# Feature columns that the median-imputation and outlier helpers in this cell
# iterate over.
columns = ['height', 'weight', 'sleepiness', 'iq', 'fb_friends', 'yt']

def data_preprocess(df):
  """Run the whole cleaning pipeline on a raw data frame.

  Stages, in order:
    1. drop the unused 'star_sign' column,
    2. impute missing values (missing_value_inputation),
    3. handle out-of-range values (change_outlier),
    4. clean and tokenize the 'self_intro' text (self_intro_pre).

  Returns the frame produced by the last stage.
  """
  without_star_sign = df.drop(['star_sign'], axis=1)

  return (
      without_star_sign
      .pipe(missing_value_inputation)
      .pipe(change_outlier)
      .pipe(self_intro_pre)
  )


def missing_value_inputation(df):
  """Impute missing values and make 'phone_os' and 'yt' numeric.

  * 'phone_os': missing entries are replaced by 'Android', then the column
    is integer-encoded with a LabelEncoder fitted on this frame.
  * 'yt': converted with pd.to_numeric; entries that cannot be parsed
    become NaN.
  * Finally the frame goes through median_inputation.

  The passed frame is modified in place and is also the return value.
  """
  # NOTE(review): the encoder is fitted on whichever frame is passed in, so
  # two frames receive the same integer codes only when they contain the same
  # set of 'phone_os' values -- confirm this holds for train and test.
  os_filled = df['phone_os'].fillna('Android')
  df['phone_os'] = LabelEncoder().fit_transform(os_filled)

  df['yt'] = pd.to_numeric(df['yt'], errors='coerce')

  return median_inputation(df)

def median_inputation(df):
  """Fill missing values in every column listed in `columns` with that
  column's median.

  The frame is updated in place and returned.
  """
  for name in columns:
    series = df[name]
    df[name] = series.fillna(series.median())

  return df


def change_outlier(df, lower_bound=0, upper_bound=500, cols=None):
  """Cap out-of-range values to the nearest bound.

  Every value below `lower_bound` is replaced by `lower_bound` and every
  value above `upper_bound` by `upper_bound`. Missing values are left as
  they are.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to clean; the listed columns are overwritten in place.
  lower_bound, upper_bound : number, default 0 and 500
      Inclusive range that values are clipped to.
  cols : iterable of str, optional
      Columns to process. Defaults to the module-level `columns` list.

  Returns
  -------
  pandas.DataFrame
      The same frame, with the selected columns clipped.
  """
  if cols is None:
    cols = columns

  for column in cols:
    # Assign the clipped Series back to the frame. The previous chained
    # `df[column].iloc[...] = ...` form triggered SettingWithCopyWarning and
    # wrote the strings '0' / '500' into numeric columns.
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

  return df

def remove_outliers_by_IQR(df):
  """Drop rows that fall outside the 1.5 * IQR fences of any column in
  `columns`.

  Columns are processed one after another, so the quartiles of each column
  are computed on the rows that survived the previous columns. The frame
  shape is printed before and after filtering.

  Returns the filtered frame.
  """
  print(df.shape)

  for name in columns:
    q1 = df[name].quantile(0.25)
    q3 = df[name].quantile(0.75)
    spread = q3 - q1

    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    # between() is inclusive on both ends, matching >= low and <= high.
    df = df[df[name].between(low_fence, high_fence)]

  print(df.shape)
  return df

def draw_outlier(df):
  """Draw a seaborn box plot of the frame and display it."""

  sns.boxplot(data=df)
  plt.show()

### b. Word Embedding


In [None]:
!pip install contractions
import contractions
import re
import pandas as pd
# NLTK resources used by the text-processing function below: the English
# stop-word list and the WordNet data for the lemmatizer.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stops_eng = set(stopwords.words('english')) # English stop words
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
# Shared lemmatizer instance used by self_intro_pre.
Lemmatizer = WordNetLemmatizer()


# Step 3 helper function: strip tags, punctuation and digits
def remove(phrase):
    """Replace markup tags, punctuation and digits in `phrase` with spaces.

    The substitutions run in that order; each match becomes a single space,
    so the result may contain runs of whitespace.
    """
    cleanup_patterns = (
        r"<.*?>",    # tags such as <br> or </p>
        r"[^\w\s]",  # anything that is neither a word character nor whitespace
        r'\d',       # digits, one at a time
    )
    for pattern in cleanup_patterns:
        phrase = re.sub(pattern, " ", phrase)
    return phrase

# this is the function of text processing
def self_intro_pre(df):
    """Clean and tokenize the 'self_intro' text column.

    Processing steps, in order:
      1. cast to str and lowercase,
      2. expand contractions (contractions.fix),
      3. replace tags, punctuation and digits with spaces (remove),
      4. split on whitespace,
      5. drop English stop words,
      6. lemmatize every remaining token as a verb.

    The column is overwritten in place with one list of tokens per row, and
    the same frame is returned.
    """
    # NOTE(review): astype(str) turns a missing value into the literal string
    # 'nan', which is then processed like ordinary text -- confirm that is
    # the intended handling of empty introductions.
    text = df['self_intro'].astype(str).str.lower()
    text = text.apply(contractions.fix)
    text = text.apply(remove)
    tokens = text.str.split()

    def _filter_and_lemmatize(words):
        # Steps 5 and 6 in one pass: skip stop words, lemmatize the rest.
        return [
            Lemmatizer.lemmatize(word, pos='v')
            for word in words
            if word not in stops_eng
        ]

    # A single Series.apply replaces the two iterrows()/df.at loops: it is
    # one pass over the data and does not depend on the index being unique.
    df['self_intro'] = tokens.apply(_filter_and_lemmatize)

    return df



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# TF-IDF features for 'self_intro'. The vectorizer and the feature selector are fitted on train and then applied to test, so both frames end up with the same selected features.
# Only the 'self_intro' features are returned; concatenate them with the other variables (weight, yt, ...) before training a model.
def calculate_tfidf(train, test, max_features=46, k=20):
    """Build TF-IDF features from 'self_intro' and keep the k best by chi-squared.

    The vectorizer and the selector are fitted on `train` only and then
    applied to `test`, so both outputs share the same columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'self_intro' and the target column 'gender'.
    test : pandas.DataFrame
        Must contain 'self_intro'.
    max_features : int, default 46
        Vocabulary size passed to TfidfVectorizer. The default comes from an
        earlier Counter-based count of words that occur more than 3 times.
    k : int or 'all', default 20
        Number of features SelectKBest keeps. It is capped at the actual
        vocabulary size so a small corpus does not make the selector fail.

    Returns
    -------
    (tfidf_train, tfidf_test) : tuple of pandas.DataFrame
        One column per selected term, indexed like `train` / `test` so the
        result can be merged back on the index.
    """
    # Work on local copies of the text so the caller's frames are not modified.
    # NOTE(review): astype(str) turns missing values into the string 'nan',
    # which the vectorizer treats as a word -- confirm this is acceptable.
    train_text = train['self_intro'].astype(str)
    test_text = test['self_intro'].astype(str)

    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    X_train_tfidf = tfidf_vectorizer.fit_transform(train_text)
    X_test_tfidf = tfidf_vectorizer.transform(test_text)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Feature selection with the chi-squared test.
    n_features = X_train_tfidf.shape[1]
    k_effective = k if k == 'all' else min(k, n_features)
    k_best = SelectKBest(chi2, k=k_effective)
    X_train_tfidf_sel = k_best.fit_transform(X_train_tfidf, train['gender'])
    X_test_tfidf_sel = k_best.transform(X_test_tfidf)

    selected_features_indices = k_best.get_support(indices=True)
    selected_features = [feature_names[i] for i in selected_features_indices]

    # Keep the callers' indexes so an index-based merge lines rows up correctly.
    tfidf_train = pd.DataFrame(X_train_tfidf_sel.toarray(),
                               columns=selected_features, index=train.index)
    tfidf_test = pd.DataFrame(X_test_tfidf_sel.toarray(),
                              columns=selected_features, index=test.index)

    return tfidf_train, tfidf_test

### c. Model

In [None]:
def run_random_forest(x_train, y_train, test_data):
  """Fit a random forest (random_state=42) and predict labels for test_data.

  Returns a one-column DataFrame named "gender" holding the predictions.
  """
  forest = RandomForestClassifier(random_state=42)
  forest.fit(x_train, y_train)

  predicted = forest.predict(test_data)
  return pd.DataFrame(predicted, columns = ["gender"])


def run_random_forest_acc(x_train, y_train, x_test, y_test):
  """Fit a random forest (random_state=42), then print its accuracy and
  confusion matrix on the held-out data.

  Returns the predicted labels for x_test.
  """
  forest = RandomForestClassifier(random_state=42)
  forest.fit(x_train, y_train)

  predictions = forest.predict(x_test)
  score = accuracy_score(y_test, predictions)

  print('------------rf-------------', score, sep='\n')
  print(confusion_matrix(y_test, predictions))
  return predictions


def run_svm(x_train, y_train, test_data, set_kernel=None):
  """Fit an SVC and predict labels for test_data.

  set_kernel is passed to SVC as its kernel; when it is None the 'rbf'
  kernel is used.

  Returns a one-column DataFrame named "gender" holding the predictions.
  """
  kernel = 'rbf' if set_kernel is None else set_kernel

  classifier = SVC(kernel=kernel)
  classifier.fit(x_train, y_train)

  predicted = classifier.predict(test_data)
  return pd.DataFrame(predicted, columns = ["gender"])


def run_svm_acc(x_train, y_train, x_test, y_test, set_kernel=None):
  """Fit an SVC, then print its accuracy and confusion matrix on the
  held-out data.

  set_kernel is passed to SVC as its kernel; when it is None the 'rbf'
  kernel is used.

  Returns the predicted labels for x_test.
  """
  kernel = 'rbf' if set_kernel is None else set_kernel

  classifier = SVC(kernel=kernel)
  classifier.fit(x_train, y_train)

  predictions = classifier.predict(x_test)
  score = accuracy_score(y_test, predictions)

  print('------------svm-------------', score, sep='\n')
  print(confusion_matrix(y_test, predictions))

  return predictions


# KNN model
def KNN(x_train, y_train, x_test, y_test):
  """Fit a 5-nearest-neighbours classifier and return its accuracy on the
  held-out data (a float, not the predictions)."""
  model = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
  predictions = model.predict(x_test)

  return accuracy_score(y_test, predictions)

## Main

In [None]:
# Load the labelled training data and the unlabelled test data.
df = pd.read_csv('drive/My Drive/Colab Notebooks/datasets/boy/boy or girl 2024 train_missingValue.csv')
df_test = pd.read_csv('drive/My Drive/Colab Notebooks/datasets/boy/boy or girl 2024 test no ans_missingValue.csv')

# TF-IDF features from 'self_intro'.
# NOTE(review): this runs on the raw text, before data_preprocess cleans
# 'self_intro'; the cleaned column is dropped below without being used --
# confirm this order is intended.
df_intro, df_test_intro = calculate_tfidf(df, df_test)

# Preprocessing
x = data_preprocess(df)
df_test = data_preprocess(df_test)

y = x['gender']

# Attach the TF-IDF columns by index, then drop the raw text and the target.
x = x.merge(df_intro, how='left', left_index=True, right_index=True)
df_test = df_test.merge(df_test_intro, how='left', left_index=True, right_index=True)
x = x.drop(['self_intro', 'gender'], axis=1)
df_test = df_test.drop(['self_intro', 'gender'], axis=1)

oversample = SMOTE(random_state=42)

# Validation: split BEFORE oversampling and resample only the training fold.
# SMOTE builds synthetic rows by interpolating between real ones, so running
# it before the split leaks validation rows into training and inflates the
# reported accuracy.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
x_train, y_train = oversample.fit_resample(x_train, y_train)

# RandomForest: score on the untouched validation fold
y_pred = run_random_forest_acc(x_train, y_train, x_test, y_test)
print(y_pred)

# Final model: oversample the full labelled set, train on it, and predict the
# test file.
x, y = oversample.fit_resample(x, y)
y_pred = run_random_forest(x, y, df_test)
y_pred.to_csv('/content/drive/My Drive/Colab Notebooks/output/rf.csv', index=False, encoding='utf-8')

# svm
# y_predict = run_svm_acc(x_train, y_train, x_test, y_test, 'linear')
# print(y_predict)
# y_predict = run_svm(x, y, df_test, 'linear')
# y_predict.to_csv('/content/drive/My Drive/Colab Notebooks/output/svm.csv', index=False, encoding='utf-8')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].iloc[outliers_indices_1] = '0'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].iloc[outliers_indices_1] = '0'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].iloc[outliers_indices_1] = '0'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].iloc[outliers_indices_1] = '0'


------------rf-------------
0.9291338582677166
[[57  7]
 [ 2 61]]
[2 2 1 2 2 1 2 2 2 1 2 2 1 2 2 1 2 2 1 2 2 1 2 1 2 1 2 2 2 1 1 2 2 2 1 2 1
 2 1 2 1 2 1 2 2 1 2 1 2 2 1 2 1 1 2 2 1 1 2 1 2 2 2 1 1 2 2 1 2 2 2 2 2 1
 2 1 2 1 1 1 2 2 2 2 1 2 1 1 1 1 1 2 2 1 1 1 2 2 1 2 1 1 1 2 1 2 1 1 2 1 2
 1 1 1 2 1 2 1 2 1 1 2 2 1 2 1 1]
