In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers




In [2]:
# Load the raw consultant records into a DataFrame and preview the first rows.
# If the CSV turns out to start with a byte-order mark, pass
# encoding='utf-8-sig' to read_csv instead.
data = pd.read_csv(filepath_or_buffer='Dataset Mental Health.csv')
data.head(n=3)

Unnamed: 0,Name,Age,Status,Price,Methods,User Viewed,Rating,Category,Domicile,Gender
0,Monica Putri Yani,21,Psychology,"Rp85.000,00",Online,200,350,Family,Online,Female
1,Mohammad Candra,18,Consultation,"Rp55.000,00",Online,450,400,Finance,Online,Male
2,Salma fitriyani,32,Psychology,"Rp60.000,00",Offline,120,370,Finance,Gunung Kidul,Female


In [3]:
# Convert the gender column from text to integers (Male = 1; Female = 0).
# Note that the column label used by this dataset ends with a space: "Gender ".
gender = dict(Male=1, Female=0)
data["Gender "] = data["Gender "].map(gender)
data.head()

Unnamed: 0,Name,Age,Status,Price,Methods,User Viewed,Rating,Category,Domicile,Gender
0,Monica Putri Yani,21,Psychology,"Rp85.000,00",Online,200,350,Family,Online,0
1,Mohammad Candra,18,Consultation,"Rp55.000,00",Online,450,400,Finance,Online,1
2,Salma fitriyani,32,Psychology,"Rp60.000,00",Offline,120,370,Finance,Gunung Kidul,0
3,Mega,40,Psychiatrist,"Rp125.000,00",Hybrid,350,500,Sexual,Jakarta Pusat,0
4,Himawari Yuka,22,Psychiatrist,"Rp100.000,00",Online,188,498,Sexual,Online,0


In [4]:
# Check how the two gender classes are distributed in the dataset.
num_obs = data.shape[0]
num_true = int((data['Gender '] == 1).sum())    # rows encoded as Male
num_false = int((data['Gender '] == 0).sum())   # rows encoded as Female

# One line per class: absolute count followed by its share of all rows.
for template, count in (
    ("Number of Men:  {0} ({1:2.2f}%)", num_true),
    ("Number of Female: {0} ({1:2.2f}%)", num_false),
):
    print(template.format(count, (count/num_obs) * 100))

Number of Men:  167 (50.00%)
Number of Female: 167 (50.00%)


In [5]:
from sklearn.model_selection import train_test_split

# The name is the only input feature; the encoded gender column is the target.
feature_col_names = ["Name"]
predicted_class_names = ["Gender "]
split_test_size = 0.30

# Both selections use a list of columns, so X and y are 2-D arrays with a
# single column each.
X = data.loc[:, feature_col_names].to_numpy()
y = data.loc[:, predicted_class_names].to_numpy()

# Stratify on the target so both splits keep the class balance, and fix the
# seed so the split is reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_test_size,
    stratify=y,
    random_state=42,
)


In [6]:
# Report the class balance of the full dataset and of each split.
# Every entry is (format string, class count, group size); None stands for
# the blank separator line between groups.
report_rows = (
    ("Dataset Male       : {0} ({1:0.2f}%)", int((data['Gender '] == 1).sum()), len(data.index)),
    ("Dataset Female     : {0} ({1:0.2f}%)", int((data['Gender '] == 0).sum()), len(data.index)),
    None,
    ("Dataset Training Male   : {0} ({1:0.2f}%)", np.count_nonzero(y_train == 1), len(y_train)),
    ("Dataset Training Female : {0} ({1:0.2f}%)", np.count_nonzero(y_train == 0), len(y_train)),
    None,
    ("Dataset Test Male       : {0} ({1:0.2f}%)", np.count_nonzero(y_test == 1), len(y_test)),
    ("Dataset Test Female     : {0} ({1:0.2f}%)", np.count_nonzero(y_test == 0), len(y_test)),
)

for row in report_rows:
    if row is None:
        print("")
        continue
    template, count, total = row
    print(template.format(count, (count/total) * 100.0))

Dataset Male       : 167 (50.00%)
Dataset Female     : 167 (50.00%)

Dataset Training Male   : 117 (50.21%)
Dataset Training Female : 116 (49.79%)

Dataset Test Male       : 50 (49.50%)
Dataset Test Female     : 51 (50.50%)


In [7]:
# Represent each name by counts of its character n-grams (lengths 2 to 6),
# using the 'char_wb' analyzer.
vectorizer = CountVectorizer(ngram_range=(2,6), analyzer='char_wb')

# The vocabulary is learned from the training names only; the test names are
# encoded with that same vocabulary.
X_train = vectorizer.fit_transform(text_train.ravel())
X_test = vectorizer.transform(text_test.ravel())

# Baseline classifier on the n-gram counts; labels are flattened to 1-D.
clf = LogisticRegression()
clf.fit(X_train, y_train.ravel())

In [8]:
# Predictions of the logistic-regression model on the held-out names.
clf_predict = clf.predict(X_test)

# Using Naive Bayes: the same character n-gram counts feed a multinomial
# Naive Bayes classifier. Wrapping both steps in a pipeline lets raw names be
# passed in directly. (Swapping LogisticRegression() in as the 'clf' step
# would give the pipeline form of the model fitted above.)
nb_steps = [
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2,6))),
    ('clf', MultinomialNB()),
]
clf_nb = Pipeline(nb_steps)
clf_nb.fit(text_train.ravel(), y_train.ravel())

# Accuracy: fraction of test names whose predicted label equals the true one.
predicted = clf_nb.predict(text_test.ravel())
(predicted == y_test.ravel()).mean()

0.9900990099009901

In [9]:
# Map the integer-encoded gender column back to its text labels
# (1 -> "Male", 0 -> "Female") and show the resulting frame.
gender = {code: label for label, code in (("Male", 1), ("Female", 0))}
data["Gender "] = data["Gender "].map(gender)
data

Unnamed: 0,Name,Age,Status,Price,Methods,User Viewed,Rating,Category,Domicile,Gender
0,Monica Putri Yani,21,Psychology,"Rp85.000,00",Online,200,350,Family,Online,Female
1,Mohammad Candra,18,Consultation,"Rp55.000,00",Online,450,400,Finance,Online,Male
2,Salma fitriyani,32,Psychology,"Rp60.000,00",Offline,120,370,Finance,Gunung Kidul,Female
3,Mega,40,Psychiatrist,"Rp125.000,00",Hybrid,350,500,Sexual,Jakarta Pusat,Female
4,Himawari Yuka,22,Psychiatrist,"Rp100.000,00",Online,188,498,Sexual,Online,Female
...,...,...,...,...,...,...,...,...,...,...
329,Salman,40,Psychiatrist,1000000,Offline,80,4.8,Sexual,Bandung,Male
330,Rizqi,40,Psychiatrist,1000000,Offline,80,4.8,Sexual,Bandung,Male
331,Gaskur,40,Psychiatrist,1000000,Offline,80,4.8,Sexual,Bandung,Male
332,Rifando,40,Psychiatrist,1000000,Offline,80,4.8,Sexual,Bandung,Male


In [10]:

# Build the model: four ReLU hidden layers of decreasing width, each followed
# by a Dropout(0.5) layer, then a single sigmoid unit for the binary output.
hidden_units = (128, 64, 32, 16)
network_layers = []
for units in hidden_units:
    network_layers.append(tf.keras.layers.Dense(units, activation='relu'))
    network_layers.append(tf.keras.layers.Dropout(0.5))
network_layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))
model = tf.keras.Sequential(network_layers)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model. The test split is passed as validation data, so the
# per-epoch validation metrics are computed on the same rows as X_test/y_test.
model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=13,
    validation_data=(X_test, y_test),
)


# Get recommendations
def get_recommendations(key):
    """Return up to ten rows of ``data`` for one gender, ranked by model score.

    Parameters
    ----------
    key : str
        Gender label to look up, e.g. "Female" or "Male". Matching ignores
        case and surrounding whitespace but must match the whole label.

    Returns
    -------
    pandas.DataFrame or str
        The (at most) ten matching rows with the highest model score, in
        descending score order; or a plain message string when no row carries
        the requested gender label.
    """
    # Compare whole labels. A substring search is wrong here because "male"
    # is contained in "female", which made the Male query select every row.
    labels = data['Gender '].astype(str).str.strip().str.lower()
    df_key = data[labels == str(key).strip().lower()]
    if df_key.empty:
        return "The Gender are not found in the dataset."

    # The vectorizer and the network were fitted on names, so score the names
    # of the selected rows (scoring the constant gender label would give every
    # row in the group the same score).
    name_matrix = vectorizer.transform(df_key['Name'])
    scores = model.predict(name_matrix)

    # argsort yields positions within df_key, not index labels of data, so the
    # rows must be picked positionally from the filtered frame.
    # NOTE(review): the score is the model's sigmoid output, trained with
    # Male = 1; confirm that descending order is the intended ranking for the
    # Female group as well.
    top_n_positions = np.argsort(scores[:, 0])[::-1][:10]
    recommendations_df = df_key.iloc[top_n_positions]
    return recommendations_df



Epoch 1/13


Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


In [11]:
def GENDER():
    """Prompt for a gender choice and print the matching recommendations.

    Accepts exactly "Female", "Male" or "All" (case-sensitive). Any other
    input prints a retry message. Nothing is returned.
    """
    print("Female, Male, or All \n")

    choice = input("Choose to Consult? : ")

    # A single gender: look it up and print the result directly.
    if choice in ("Female", "Male"):
        print(get_recommendations(choice))
        return

    # Anything that is not one of the three known options is rejected.
    if choice != "All":
        print("Sorry, please re-enter it.")
        return

    # "All": fetch both groups first, then print them under one heading.
    female_results = get_recommendations("Female")
    male_results = get_recommendations("Male")

    print("Recommendations for Female and Male:")
    print("Female:\n", female_results)
    print("\nMale:\n", male_results)

GENDER()

Female, Male, or All 

               Name  Age        Status         Price  Methods  User Viewed  \
31      Novi Azizah   27   Psychology    Rp58.000,00   Hybrid          750   
126      Sari Indah   41  Psychiatrist        980000  Offline          145   
63             Nita   27  Psychiatrist  Rp100.000,00  Offline         1200   
254      Citra Sari   26  Psychiatrist        680000   Hybrid          105   
190    Rina Purnama   29  Psychiatrist        750000   Hybrid           95   
287      Wulan Sari   25    Psychology        550000   Online           75   
318       Siti Wati   34  Psychiatrist        820000  Offline          125   
94    Sophie Turner   28    Psychology        500000   Online          120   
158      Siti Indah   28  Consultation        750000   Online          110   
222  Nina Wulandari   28    Psychology        600000   Hybrid          110   

    Rating Category   Domicile Gender   
31    4,00   Friend   Semarang  Female  
126    4.7   Sexual    Bandung  Fema