`Importing modules`

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

`Reading the data from the csv file and making a pandas dataframe of it`

In [None]:
df = pd.read_csv('profiles.csv', encoding='utf-8')

`Gathering some info about the dataframe`

In [None]:
df.info()

In [None]:
print('Some descriptive statistic data of numerical variables')
df.describe()

In [None]:
print('You can see above that the height and the income columns include data unrealistic.')

In [None]:
print('Unique values of income')
df.income.unique()

In [None]:
df[df.income == -1].head(1)

In [None]:
print('The number of observations with income value -1 is: ', len(df[df.income == -1]))

In [None]:
# Narration for the reader: explains why the income column is removed below.
# The -1 placeholder cannot be treated as a real income, and imputing it would
# distort the distribution.
print('''We could drop the whole income column because -1 is not a usual categorical variable and definitely not
a continuous numerical variable. Thus we cannot use them or replace the values without introducing bias.''')

In [None]:
df = df.drop('income', axis=1)

In [None]:
#df.religion.unique()
df.pets.unique()

In [None]:
print('Unique values of height')
df.height.unique()

In [None]:
print('''The smallest 18 year-old-man in the world is 29 inches, thus we can replace, or drop the values if they are lower than that.
https://www.mirror.co.uk/news/world-news/worlds-shortest-teenager-named-18-27050545''')

In [None]:
invalid_heights = (df.height <= 29) | (df.height.isna())

In [None]:
# Keep only rows with a plausible, non-missing height. The explicit .copy()
# makes the result an independent frame, so the column assignments done in
# later cells do not raise SettingWithCopyWarning.
df = df[~invalid_heights].copy()

In [None]:
df.describe()

In [None]:
df.height.unique()

`Converting height from inches to centimetres`

In [None]:
df.height = df.height*2.54

`Take a look at the "nan" values of the whole dataframe`

In [None]:
df_isna = df.isna().sum().sort_values()
#df_isna

`Choosing some categorical and numerical variables (with zero or few null values) and putting them into a list`

In [None]:
cats_and_nums = ['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'height', 'orientation', 'religion', 'sex', 'sign', 'smokes', 'status']

In [None]:
# Spell out the sex codes. Assign the result back instead of calling an
# inplace method on `df.sex`: the chained inplace form acts on a temporary
# Series and does not update `df` under pandas copy-on-write.
df['sex'] = df['sex'].replace({'m': 'male', 'f': 'female'})

In [None]:
#for feature in cats_and_nums:
    #print(df[feature].value_counts())

In [None]:
#df = df[cats_and_nums].dropna()

In [None]:
def unique_and_fill(features: list, df: pd.DataFrame) -> "Iterator[tuple[str, np.ndarray]]":
    """Fill missing values of non-numeric features and report their unique values.

    This is a generator: the fill happens lazily, so `df` is only modified
    once the result is consumed (e.g. with ``list(...)``).

    Parameters
    ----------
    features : list
        Column names to inspect.
    df : pd.DataFrame
        Frame that is modified in place; missing values in every non-numeric
        column listed in `features` are replaced by the string 'unknown'.

    Yields
    ------
    (feature, uniques)
        The column name and its unique values after filling. Numeric columns
        are skipped and left untouched.
    """
    for feature in features:
        # Any numeric dtype is skipped, not only the literal int64/float64.
        if pd.api.types.is_numeric_dtype(df[feature]):
            continue
        df[feature] = df[feature].fillna('unknown')
        yield feature, df[feature].unique()

In [None]:
list((unique_and_fill(cats_and_nums, df)))

`Defining a function that collapses the many unique values of a feature into a smaller set of groups.`

In [None]:
# `itertools` is a module, not a package, so `import itertools.product` raises
# ModuleNotFoundError; `product` has to be imported from it.
from itertools import product as pro

In [None]:
def change_values(new_values: list, df: pd.DataFrame, feature: str) -> pd.Series:
    """Collapse free-text categories of one column onto a list of base labels.

    Every distinct string in ``df[feature]`` that contains one of
    `new_values` as a substring is replaced by that value. The values are
    tried in the given order against the already collapsed label, so a later
    value that is a substring of an earlier one takes over.

    Parameters
    ----------
    new_values : list
        Base labels (strings) to collapse onto.
    df : pd.DataFrame
        Frame that is modified in place.
    feature : str
        Name of the column to rewrite.

    Returns
    -------
    pd.Series
        The rewritten column ``df[feature]``.
    """
    mapping = {}
    for element in df[feature].unique():
        # Missing or otherwise non-string entries cannot be substring-tested.
        if not isinstance(element, str):
            continue
        collapsed = element
        for value in new_values:
            if value in collapsed:
                collapsed = value
        if collapsed != element:
            mapping[element] = collapsed
    if mapping:
        # One pass over the column instead of one replace per matching pair.
        df[feature] = df[feature].map(lambda item: mapping.get(item, item))
    return df[feature]

`Making groups from signs, diets and religions by using the above function`

In [None]:
signs_ok = ['gemini', 'cancer', 'pisces', 'aquarius', 'taurus', 'virgo', 'sagittarius', 'leo', 'aries', 'libra', 'scorpio', 'capricorn']
sings_change = change_values(signs_ok, df, 'sign')
diets_ok = ['kosher', 'halal', 'anything', 'other', 'vegetarian', 'vegan']
diets_change = change_values(diets_ok, df, 'diet')
religions_ok = ['agnosticism', 'atheism', 'christianity', 'other', 'catholicism', 'buddhism', 'judaism', 'hinduism', 'islam']
religion_change = change_values(religions_ok, df, 'religion')
print(sings_change)

`Manual data manipulation of education unique values`

In [None]:
edu_unique = ['working on college/university', 'working on space camp', 'graduated from masters program', 'graduated from college/university',
'working on two-year college', 'unknown', 'graduated from high school', 'working on masters program', 'graduated from space camp',
'college/university', 'dropped out of space camp', 'graduated from ph.d program', 'graduated from law school', 'working on ph.d program',
'two-year college','graduated from two-year college', 'working on med school', 'dropped out of college/university', 'space camp',
'graduated from med school', 'dropped out of high school', 'working on high school', 'masters program', 'dropped out of ph.d program',
'dropped out of two-year college', 'dropped out of med school', 'high school', 'working on law school', 'law school',
'dropped out of masters program', 'ph.d program', 'dropped out of law school', 'med school']

In [None]:
high_school = ['working on college/university', 'working on two-year college', 'graduated from high school', 'working on masters program',
'high school','working on law school', 'working on high school']
college = ['graduated from law school', 'two-year college', 'graduated from two-year college','working on med school', 'law school']
master = ['graduated from masters program', 'graduated from college/university', 'college/university', 'working on ph.d program',
'masters program']
phd = ['graduated from ph.d program', 'graduated from med school', 'ph.d program', 'med school']
other = ['working on space camp', 'graduated from space camp', 'space camp']
dropped = ['dropped out of space camp', 'dropped out of college/university', 'dropped out of high school', 'dropped out of ph.d program',
'dropped out of two-year college', 'dropped out of med school', 'dropped out of masters program','dropped out of law school']

In [None]:
def list_replacement(df: pd.DataFrame, feature: str, old_values: list, new_values: list) -> pd.Series:
    """Replace groups of old labels in one column by one new label per group.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that is modified in place.
    feature : str
        Name of the column to rewrite.
    old_values : list
        One entry per group; each entry is passed to ``Series.replace`` as the
        values to look for (a list of labels or a single label).
    new_values : list
        Replacement label for the group at the same position.

    Returns
    -------
    pd.Series
        The rewritten column ``df[feature]``.

    Raises
    ------
    IndexError
        If `new_values` has fewer entries than `old_values`. The check runs
        before anything is replaced, so the column is never left half-updated.
    """
    if len(new_values) < len(old_values):
        raise IndexError(
            f'new_values has {len(new_values)} entries but old_values has {len(old_values)}'
        )
    column = df[feature]
    # Groups are applied in order, exactly like consecutive replace calls.
    for old_group, new_label in zip(old_values, new_values):
        column = column.replace(old_group, new_label)
    df[feature] = column
    return df[feature]

In [None]:
all_in = [high_school, college, master, phd, other, dropped]
new = ['high school', 'college', 'master', 'phd', 'other', 'dropped out']
edu_test = list_replacement(df, 'education', all_in, new)

`Making pie charts, countplots and histograms of the features from the list`

In [None]:
def charts(features: list, dataframe: pd.DataFrame) -> None:
    """Draw one distribution chart per feature and show it.

    * object columns with fewer than 5 distinct values: donut-style pie chart
    * other object columns: horizontal count plot split by 'sex'
    * int64 / float64 columns: histogram split by 'sex' with the rounded mean
      marked by a red vertical line

    Parameters
    ----------
    features : list
        Column names of `dataframe` to plot.
    dataframe : pd.DataFrame
        Source data; must contain a 'sex' column for the hue split.

    Returns
    -------
    None
        Figures are displayed with ``plt.show()`` and then closed.
    """
    for feat in features:
        fig = plt.figure(figsize=[10, 6])
        sns.set_palette('Set1')
        data = dataframe[feat]
        plt.title(f'Distribution of {feat} of {len(data)} participants')
        if data.dtype == 'object':
            if data.nunique() < 5:
                counts = data.value_counts()
                plt.pie(counts, autopct='%d%%', pctdistance=0.85, explode=[0.035] * len(counts))
                # Label wedges in the order they were drawn (value_counts
                # order); data.unique() is in order of appearance and would
                # attach the wrong names.
                plt.legend(list(counts.index))
                # White disc in the middle turns the pie into a donut.
                plt.gca().add_artist(plt.Circle((0, 0), 0.7, facecolor='white'))
            else:
                sns.countplot(y=feat, data=dataframe, saturation=0.75, palette='Set1', hue='sex')
                plt.xticks(rotation=15)
        elif data.dtype in ['int64', 'float64']:
            sns.histplot(x=feat, data=dataframe, hue='sex', bins=30)
            mean = np.round(data.mean())
            plt.axvline(mean, color='red')
            # Place the label relative to the axis height so it stays visible
            # whatever the bin counts are.
            plt.text(x=mean*1.1, y=plt.gca().get_ylim()[1] * 0.9, s=f'Mean: {mean}')
        plt.show()
        # Close this figure explicitly; plt.clf() would act on (or create) a
        # current figure and leave figures open across the loop.
        plt.close(fig)

In [None]:
test_charts = charts(cats_and_nums, df)

`Making some dataframes by copying the original`

In [None]:
df_model = df.copy()
df_new = df.copy()

In [None]:
df_model.head(1)

In [None]:
# Hand-made numeric codes for the low-cardinality features.
# map()/isin() are used instead of replace(): replacing strings by numbers
# relies on pandas silently downcasting an object column, which is deprecated,
# and an object-typed column later breaks the classifiers.
df_model.sex = df_model.sex.map({'male': 1, 'female': 0})
df_model.orientation = df_model.orientation.map({'straight': 2, 'bisexual': 1, 'gay': 0})
# 1 = looking for a partner (single / available), 0 = everything else.
df_model.status = np.where(df_model.status.isin(['single', 'available']), 1, 0)

# 1 = does not smoke / never uses drugs, 0 = any other answer (incl. 'unknown').
df_model.smokes = np.where(df_model.smokes == 'no', 1, 0)
df_model.drugs = np.where(df_model.drugs == 'never', 1, 0)

`Label-encoding the remaining categorical features automatically`

In [None]:
encodeable = [feat for feat in cats_and_nums if feat not in ['sex', 'orientation', 'status', 'smokes', 'drugs', 'age', 'height']]

In [None]:
# Integer-encode every remaining categorical feature. Each fitted encoder is
# kept by column name so codes can be translated back to labels later
# (e.g. label_encoders['sign'].inverse_transform(...)).
label_encoders = {}
for uncoded in encodeable:
    encoder = LabelEncoder()
    df_model[uncoded] = encoder.fit_transform(df_model[uncoded])
    label_encoders[uncoded] = encoder

In [None]:
df_model = df_model[cats_and_nums]

In [None]:
#scaler = StandardScaler()

In [None]:
#df_model = scaler.fit_transform(df_model)
#df_model

`Making a function to build and use LogisticRegression model.`

In [None]:
def model_log(df: pd.DataFrame, target: str, sample_frac: float = .25, random_state=None) -> dict:
    """Fit a logistic regression on a random subsample and report test accuracy.

    Parameters
    ----------
    df : pd.DataFrame
        Fully numeric modelling frame that contains `target`.
    target : str
        Name of the column to predict; all other columns are used as features.
    sample_frac : float, default 0.25
        Fraction of the rows drawn (without replacement) before splitting.
    random_state : int or None, default None
        Seed for the subsample and the solver; None keeps the run unseeded.

    Returns
    -------
    dict
        Keys 'Logistic model', 'Features', 'Target' and 'Accuracy'.
    """
    # NOTE(review): this filter only removes rows while the target still holds
    # the literal string 'unknown'; once the column is label-encoded the
    # comparison matches nothing - confirm which form callers pass in.
    df = df[df[target] != 'unknown']
    df = df.sample(int(sample_frac*len(df)), random_state=random_state)
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=15)
    # Fit the scaler on the training part only and actually use its output;
    # previously the scaled array was thrown away and the model saw raw data.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    model = LogisticRegression(max_iter=3500, solver='saga', random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return {'Logistic model': model._estimator_type, 'Features': list(X.columns), 'Target': target, 'Accuracy': accuracy}

In [None]:
log_test_sign = model_log(df_model, 'sign')

In [None]:
print(log_test_sign)

In [None]:
##encoder = LabelEncoder()
#df.sign = encoder.fit_transform(df.sign)

In [None]:
# Draw a reproducible subsample for the tree / KNN experiments.
# Sampling WITHOUT replacement matters: with replace=True the same profile can
# be drawn several times and end up in both the train and the test split,
# which inflates the test accuracy.
target = 'orientation'
sample = df_model.sample(n=min(10000, len(df_model)), replace=False, random_state=0)
X = sample.drop(target, axis=1)
y = sample[target]

In [None]:
X

In [None]:
y

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

In [None]:
# Baseline: 10 bagged depth-3 decision trees. The fixed random_state makes the
# bootstrap draws, and therefore the printed score, repeatable across runs.
bag_dt = BaggingClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=10, random_state=0)
bag_dt.fit(x_train, y_train)
bag_accuracy = bag_dt.score(x_test, y_test)
print('Accuracy score of Bagged Classifier, 10 estimators:')
print(bag_accuracy)

In [None]:
# Sweep k for k-nearest-neighbours on standardised features.
np.random.seed(0)
# KNN is distance based, so the features must share a scale. Fit the scaler
# once on the training split and reuse it for the test split. Separate names
# are used so the unscaled x_train / x_test stay available to later cells.
knn_scaler = StandardScaler()
x_train_scaled = knn_scaler.fit_transform(x_train)
x_test_scaled = knn_scaler.transform(x_test)
accus_test = []
neighbors = range(1,100)
for neighbor in neighbors:
    model = KNeighborsClassifier(n_neighbors=neighbor)
    model.fit(x_train_scaled, y_train)
    y_pred_test = model.predict(x_test_scaled)
    accus_test.append(accuracy_score(y_test, y_pred_test))
best_acc_neighbors = np.max(accus_test)
best_neighbors = neighbors[np.argmax(accus_test)]
print('Best accuracy with using k Nearest neighbors is: ', best_acc_neighbors, ' and the best number of neighbors is: ', best_neighbors)

In [None]:
# Sweep the maximum tree depth of a random forest and record the accuracy on
# the training and the test split for every depth.
np.random.seed(0)
depths = range(1, 20)
accuracy_train = []
accuracy_test = []
for depth in depths:
    model = RandomForestClassifier(max_depth=depth)
    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred_test))

# The depth with the highest test accuracy wins (first one on ties).
best_acc = np.max(accuracy_test)
best_depth = depths[np.argmax(accuracy_test)]
print('Best accuracy with using Randomforest is: ', best_acc, ' and the best depth is: ', best_depth)

In [None]:
plt.plot(depths, accuracy_train, 'r-o', depths, accuracy_test, 'b-*')
plt.title('Accuracy of training and test data')
plt.xlabel('depth')
plt.ylabel('accuracy')
plt.legend(['training accuracy', 'test accuracy'])
plt.show()
plt.clf()

In [None]:
best_rf = RandomForestClassifier(max_depth=best_depth)
best_rf.fit(x_train, y_train)

In [None]:
feature_importance = pd.DataFrame(zip(x_train.columns, best_rf.feature_importances_), columns=['features', 'importances']).sort_values('importances', ascending=False)
feature_importance.iloc[:5]

In [None]:
features = ['sign', 'age', 'height', 'sex', 'orientation', 'body_type', 'diet', 'drinks',  'smokes', 'drugs', 'education', 'religion', 'status']

In [None]:
df_new = df_new[features][df_new.sign != 'unknown']

In [None]:
X_new = pd.get_dummies(df_new[features[1:]], drop_first=True)
X_new

In [None]:
y_new = df_new[features[0]]

In [None]:
y_new

In [None]:
neighbors = KNeighborsClassifier(n_neighbors=1)

In [None]:
scaler = StandardScaler()

In [None]:
X_new = scaler.fit_transform(X_new)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X_new, y_new, test_size=.25, random_state=100)

In [None]:
neighbors.fit(x_train, y_train)

In [None]:
y_pred = neighbors.predict(x_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
np.random.seed(0)
accus_test = []
neighbors = range(1,50)
for neighbor in neighbors:
    model = KNeighborsClassifier(n_neighbors=neighbor)
    model.fit(x_train, y_train)
    y_pred_test = model.predict(x_test)
    accus_test.append(accuracy_score(y_test, y_pred_test))
best_acc_neighbors = np.max(accus_test)
best_neighbors = neighbors[np.argmax(accus_test)]
print('Best accuracy with using k Nearest neighbors is: ', best_acc_neighbors, ' and the best number of neighbors is: ', best_neighbors)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
array = np.array(X_new)
array