`Importing some modules`

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

`Reading the data from the csv file and making a pandas dataframe of it`

In [None]:
# Load the profiles dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('profiles.csv')

`Gathering some info about the dataframe`

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Summary statistics for the numeric columns; the last expression is rendered as the cell output.
print('Some descriptive statistic data of numerical variables')
df.describe()

In [None]:
# Narrative note about the describe() output of the previous cell (height and income columns).
print('You can see above that the height and the income columns include data unrealistic.')

In [None]:
# List the distinct income values to inspect which ones occur.
print('Unique values of income')
df.income.unique()

In [None]:
# Show a single example row whose income is recorded as -1.
df[df.income == -1].head(1)

In [None]:
# Count the -1 income rows by summing the boolean mask instead of materialising a filtered frame.
print('The number of observations with income value -1 is: ', int((df.income == -1).sum()))

In [None]:
# Narrative note giving the reason the income column is dropped in the next cell.
print('''We could drop the whole income column because -1 is not a usual categorical variable and definitely not
a continuous numerical variable. Thus we cannot use them or replace the values without avoiding bias.''')

In [None]:
# Remove the income column. Re-running this cell on the already-reduced frame raises KeyError.
df = df.drop(columns='income')

In [None]:
# List the distinct height values (still in inches at this point) to spot implausible entries.
print('Unique values of height')
df.height.unique()

In [None]:
# Narrative note giving the reason for the 29-inch cut-off used in the next cell.
print('''The smallest 18 year-old-man in the world is 29 inches, thus we can replace, or drop the values if they are lower than that.
https://www.mirror.co.uk/news/world-news/worlds-shortest-teenager-named-18-27050545''')

In [None]:
# Keep only rows with a recorded height above 29 inches. A NaN height compares as False,
# so missing heights are dropped by the same condition as before.
# .copy() makes the result an independent frame: later cells assign into its columns, and
# assigning into a filtered selection triggers SettingWithCopy warnings.
df = df[df.height > 29].copy()

In [None]:
# Re-check the summary statistics after dropping the income column and the implausible heights.
df.describe()

In [None]:
# Confirm which height values remain after the filter.
df.height.unique()

`Converting inch to cm`

In [None]:
# Convert height from inches to centimetres (1 in = 2.54 cm).
# Not idempotent: running this cell a second time multiplies the values again.
df['height'] = df['height'].mul(2.54)

`Taking a look at the missing ("NaN") values of the whole dataframe`

In [None]:
# Number of missing values per column, sorted ascending so the most complete columns come first.
df_isna = df.isna().sum().sort_values()
# Display the counts: the feature selection that follows relies on them.
df_isna

In [None]:
# Missing signs become the explicit category 'unknown' so later string operations see no NaN.
df['sign'] = df['sign'].fillna(value='unknown')

`Choosing some categorical and numerical variables (with zero or few null values) and putting them into a list`

In [None]:
# Columns used for the exploration and modelling below (a mix of numeric and categorical features).
cats_and_nums = ['age', 'body_type', 'diet', 'drinks', 'drugs', 'height', 'orientation', 'sex', 'sign', 'smokes', 'status']

In [None]:
# Spell out the sex codes. Assign the result back to the column: calling replace(inplace=True)
# on a column selection is deprecated and, with Copy-on-Write enabled, leaves `df` unchanged,
# which would make the later `== 'male'` comparison fail for every row.
df['sex'] = df['sex'].replace({'m': 'male', 'f': 'female'})

In [None]:
# Print the frequency table of every selected column, in the order given by cats_and_nums.
for _, column in df[cats_and_nums].items():
    print(column.value_counts())

In [None]:
# The twelve bare zodiac sign names that sign entries are reduced to in the next cell.
signs_ok = ['gemini', 'cancer', 'pisces', 'aquarius', 'taurus', 'virgo', 'sagittarius', 'leo', 'aries', 'libra', 'scorpio', 'capricorn']

In [None]:
# Reduce every sign entry that contains a sign name to that bare name.
# Entries that contain none of the names (e.g. 'unknown') are left as they are.
for target in signs_ok:
    # Snapshot the labels that mention this sign before any replacement for it happens.
    labels_for_sign = [label for label in df.sign.unique() if target in label]
    for label in labels_for_sign:
        df['sign'] = df['sign'].replace(to_replace=label, value=target)

In [None]:
def unique(features, data):
    """Yield ``(feature, unique_values)`` for each listed column that is not int64/float64.

    Side effect: before its values are collected, each such column of ``data`` has its
    missing entries replaced by the string ``'unknown'`` in place. Because this is a
    generator, that only happens as the result is iterated.

    Parameters
    ----------
    features : iterable of column names to inspect.
    data : DataFrame holding those columns; it is modified in place.
    """
    numeric_dtypes = ('int64', 'float64')
    for feature in features:
        if data[feature].dtype in numeric_dtypes:
            continue
        data[feature] = data[feature].fillna('unknown')
        yield feature, data[feature].unique()

In [None]:
# Exhausting the generator is what actually fills the missing categorical values in `df`
# with 'unknown'; the collected (feature, unique values) pairs are kept for inspection.
test_uniques = [*unique(cats_and_nums, df)]

In [None]:
# Show the distinct values of each non-numeric feature.
test_uniques

`Making pie charts, countplots and histograms of the features from the list`

In [None]:
def charts(features, dataframe):
    """Draw and show one distribution figure per feature.

    * object-dtype column with fewer than 6 distinct values -> donut-style pie chart
    * object-dtype column with 6 or more distinct values    -> count plot
    * int64 / float64 column -> histogram split by the 'sex' column, with the rounded
      mean marked by a red vertical line

    Columns of any other dtype get an empty figure. The function returns None.

    Parameters
    ----------
    features : iterable of column names to plot.
    dataframe : DataFrame containing those columns; the numeric branch also needs a
        'sex' column for the hue.
    """
    for feat in features:
        plt.figure(figsize=[10, 7])
        data = dataframe[feat]
        title = f'Distribution of {feat} of {len(data)} participants'
        if data.dtype == 'object':
            if data.nunique() < 6:
                # Wedges are drawn in value_counts() order, so the legend labels must come
                # from the same object; data.unique() is in order of appearance and would
                # attach the wrong label to the wedges.
                counts = data.value_counts()
                plt.pie(counts, autopct='%d%%', pctdistance=.85, explode=[0.05] * len(counts),
                        colors=sns.color_palette('Set1'))
                plt.title(title)
                plt.legend(counts.index)
                # A white disc over the centre turns the pie into a donut.
                plt.gcf().gca().add_artist(plt.Circle((0, 0), 0.7, facecolor='white'))
            else:
                sns.countplot(x=data, saturation=0.75, palette='Set1')
                plt.title(title)
                plt.xticks(rotation=35, fontsize=9)
        elif data.dtype in ['int64', 'float64']:
            sns.histplot(x=feat, data=dataframe, hue='sex', bins=30)
            plt.title(title)
            mean = np.mean(data).round()
            plt.axvline(mean, color='red')
            # The label sits at a hard-coded height of y=7000 (in count units).
            plt.text(x=mean*1.1, y=7000, s=f'Mean: {mean}')
        plt.show()
        plt.clf()

In [None]:
# Render all figures. charts() has no return statement, so test_charts is None.
test_charts = charts(cats_and_nums, df)

`Making some labels manually`

In [None]:
# Work on a copy so the numeric encoding below does not alter `df`.
df_model = df.copy()

In [None]:
# Peek at one row before encoding.
df_model.head(1)

In [None]:
# Hand-written numeric codes.
# sex: 1 for 'male', 0 for everything else.
df_model['sex'] = np.where(df_model['sex'] == 'male', 1, 0)

# orientation: straight=2, bisexual=1, gay=0.
# map() + astype('int64') yields a real integer column instead of relying on replace()
# silently downcasting an object column (deprecated in pandas). A value missing from the
# mapping becomes NaN and makes astype raise here, rather than leaving a stray string
# in the model matrix.
orientation_codes = {'straight': 2, 'bisexual': 1, 'gay': 0}
df_model['orientation'] = df_model['orientation'].map(orientation_codes).astype('int64')

# status: 1 = single/available, 0 = married / seeing someone / unknown.
status_codes = {'single': 1, 'available': 1, 'married': 0, 'seeing someone': 0, 'unknown': 0}
df_model['status'] = df_model['status'].map(status_codes).astype('int64')

`Making some labels automatically`

In [None]:
# Categorical columns that get integer codes from LabelEncoder in the next cell.
encodeable = ['body_type', 'diet', 'drinks', 'drugs', 'smokes', 'sign']

In [None]:
# Replace each listed categorical column by integer codes, using a fresh encoder per column.
for column_name in encodeable:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df_model[column_name])
    df_model[column_name] = encoded_values

In [None]:
# Restrict the modelling frame to the selected features, in the order of cats_and_nums.
df_model = df_model.loc[:, cats_and_nums]

In [None]:
# Display the fully encoded modelling frame.
df_model

`Making a function to build and use LogisticRegression model.`

In [None]:
def model_log(dataframe, target):
    """Fit a logistic regression that predicts ``target`` from all other columns.

    The data is split 70/30 (random_state=15), the features are standardised, the model
    is trained on the training part and its accuracy is measured on the held-out part.

    Parameters
    ----------
    dataframe : DataFrame with numeric columns only, including ``target``.
    target : name of the column to predict.

    Returns
    -------
    dict with the keys 'Logistic model' (the estimator type label), 'Features'
    (list of predictor column names), 'Target' and 'Accuracy' (test-set accuracy).
    """
    X = dataframe.drop(target, axis=1)
    y = dataframe[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=15)

    # Fit the scaler on the training rows only and actually use its output. The previous
    # version called fit_transform on all of X and threw the result away, so the model
    # was trained on unscaled features.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(max_iter=3000)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    # _estimator_type is a private attribute; fall back to the same label if it is absent.
    estimator_type = getattr(model, '_estimator_type', 'classifier')
    return {'Logistic model': estimator_type, 'Features': list(X.columns), 'Target': target, 'Accuracy': accuracy}

In [None]:
# One logistic model per target column; the results are unpacked in the same order as the targets.
log_test1, log_test2, log_test3, log_test4, log_test5 = (
    model_log(df_model, target_column)
    for target_column in ('orientation', 'sex', 'status', 'drugs', 'sign')
)

In [None]:
# Print each result dictionary on its own line.
print(log_test1, log_test2, log_test3, log_test4, log_test5, sep='\n')

In [None]:
# Encode the sign labels as integers directly in `df` (not the df_model copy);
# `encoder` keeps the fitted label-to-code mapping.
encoder = LabelEncoder()
df.sign = encoder.fit_transform(df.sign)

In [None]:
# Check the encoded sign column.
df.sign

In [None]:
# Predictors for the sign classifier. 'diet' from cats_and_nums is not part of this list.
feature_columns = ['age', 'body_type', 'drinks', 'drugs', 'height', 'orientation', 'sex', 'smokes', 'status']
# One-hot encode the categorical predictors, dropping the first level of each.
X = pd.get_dummies(df[feature_columns], drop_first=True)
# Target: the integer-encoded sign.
y = df['sign']

In [None]:
# Inspect the one-hot encoded design matrix.
X

In [None]:
# 70/30 train/test split with a fixed seed so the same rows are held out on every run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=100)

In [None]:
# Recursive feature elimination around a random forest, keeping the 6 strongest predictors.
# The forest is seeded so that the selected features and the score are reproducible;
# without random_state every run builds different trees.
rf = RandomForestClassifier(bootstrap=True, random_state=100)
rfe = RFE(estimator=rf, n_features_to_select=6)
rfe.fit(x_train, y_train)

In [None]:
# Score of the fitted RFE model on the held-out test rows.
rfe.score(x_test, y_test)