`Importing some modules`

In [1536]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

`Reading the data from the csv file and making a pandas dataframe of it`

In [1537]:
df = pd.read_csv('profiles.csv')

`Gathering some info about the dataframe`

In [None]:
df.info()

In [None]:
print('Some descriptive statistic data of numerical variables')
df.describe()

In [None]:
print('You can see above that the height and the income columns include data unrealistic.')

In [None]:
print('Unique values of income')
df.income.unique()

In [None]:
df[df.income == -1].head(2)

In [None]:
print('The number of observations with income value -1 is: ', len(df[df.income == -1]))

In [None]:
print('''We could drop the whole income column because -1 is not a usual categorical variable and definitely not
a continuous numerical variable. Thus we cannot use them or replace the values without avoiding bias.''')

In [None]:
df = df.drop('income', axis=1)

In [None]:
print('Unique values of height')
df.height.unique()

In [None]:
print('''The smallest 18 year-old-man in the world is 29 inches, thus we can replace, or drop the values if they are lower than that.
https://www.mirror.co.uk/news/world-news/worlds-shortest-teenager-named-18-27050545''')

In [None]:
# Keep only rows with a recorded height above 29 inches (missing heights
# compare as False and are dropped too). `.copy()` makes the result an
# independent frame, so the column assignments in later cells modify `df`
# itself instead of triggering SettingWithCopyWarning on a slice.
df = df[df.height > 29].copy()

In [None]:
df.describe()

In [None]:
df.height.unique()

`Converting height from inches to centimetres`

In [None]:
df.height = df.height*2.54

`Take a look at the "nan" values of the whole dataframe`

In [None]:
df_isna = df.isna().sum().sort_values()
#df_isna

In [None]:
# Spell out the sex codes. The result is assigned back to the column because
# an in-place replace on `df.sex` acts on an intermediate Series and is not
# guaranteed to update `df` (pandas warns about this chained pattern).
df['sex'] = df['sex'].replace({'m': 'male', 'f': 'female'})

`Choosing some categorical and numerical variables (with zero null values) and putting them into a list`

In [None]:
cats_and_nums = ['age', 'body_type', 'diet', 'drinks', 'drugs', 'height', 'orientation', 'sex', 'smokes', 'status']

In [None]:
for feature in cats_and_nums:
    print(df[feature].value_counts())

In [None]:
def unique(features, data):
    """Yield ``(column, distinct values)`` for every non-int64/float64 column.

    Side effect: each yielded column has its missing values replaced by the
    string ``'unknown'`` in ``data`` itself. Because this is a generator, the
    replacement only happens for columns that have actually been iterated.

    Parameters
    ----------
    features : iterable of str
        Column names to inspect.
    data : pandas.DataFrame
        Frame holding those columns; modified in place.
    """
    numeric_kinds = ['int64', 'float64']
    for name in features:
        # int64/float64 columns are skipped untouched.
        if data[name].dtype in numeric_kinds:
            continue
        filled = data[name].fillna('unknown')
        data[name] = filled
        yield name, data[name].unique()

In [None]:
test_uniques= list(unique(cats_and_nums, df))

In [None]:
test_uniques

`Making pie charts, countplots and histograms of the features from the list`

In [None]:
def charts(features, dataframe):
    """Draw and show one distribution chart per feature.

    * object-dtype column with fewer than six distinct values: donut chart
    * any other object-dtype column: count plot
    * int64/float64 column: histogram split by the ``sex`` column, with the
      rounded mean marked by a red vertical line

    Columns of any other dtype produce an empty figure.

    Parameters
    ----------
    features : iterable of str
        Column names of ``dataframe`` to plot.
    dataframe : pandas.DataFrame
        Source data. It must contain a ``sex`` column when a numeric feature
        is plotted, because the histogram uses it as ``hue``.

    Returns
    -------
    None
        Figures are displayed with ``plt.show()`` and then closed.
    """
    for feat in features:
        fig, ax = plt.subplots(figsize=[10, 7])
        data = dataframe[feat]
        title = f'Distribution of {feat} of {len(data)} participants'
        if data.dtype == 'object':
            if data.nunique() < 6:
                counts = data.value_counts()
                wedges = ax.pie(
                    counts, autopct='%d%%', pctdistance=.85,
                    explode=[0.05] * len(counts),
                    colors=sns.color_palette('Set1'),
                )[0]
                ax.set_title(title)
                # Wedges are drawn in value_counts() order, so the legend
                # labels must come from that same index; data.unique() is in
                # order of appearance and would mislabel the wedges.
                ax.legend(wedges, list(counts.index))
                # Filled centre circle turns the pie into a donut.
                ax.add_artist(plt.Circle((0, 0), 0.7, facecolor='lightblue'))
            else:
                sns.countplot(x=data, saturation=0.75, palette='Set1', ax=ax)
                ax.set_title(title)
                ax.tick_params(axis='x', labelrotation=35, labelsize=9)
        elif data.dtype in ['int64', 'float64']:
            sns.histplot(x=feat, data=dataframe, hue='sex', bins=30, ax=ax)
            ax.set_title(title)
            mean = np.mean(data).round()
            ax.axvline(mean, color='red')
            # x is in data units, y is a fraction of the axes height, so the
            # label stays inside the plot whatever the bin counts are.
            ax.text(mean * 1.1, 0.95, f'Mean: {mean}',
                    transform=ax.get_xaxis_transform(), va='top')
        plt.show()
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

In [None]:
test_charts = charts(cats_and_nums, df)

`Making some labels manually`

In [None]:
# Hand-made numeric codes. This cell expects the original string labels, so
# run it once on freshly prepared data.
# sex: 'male' -> 1, anything else -> 0.
df['sex'] = np.where(df['sex'] == 'male', 1, 0)
# orientation: explicit mapping plus an integer cast, so the column gets an
# integer dtype without relying on replace() downcasting an object column.
# A label missing from the mapping becomes NaN and makes the cast fail, which
# surfaces unexpected categories immediately.
df['orientation'] = df['orientation'].map({'straight': 2, 'bisexual': 1, 'gay': 0}).astype(int)
# status: 1 for 'single'/'available', 0 for every other value
# ('married', 'seeing someone', 'unknown', ...).
df['status'] = df['status'].isin(['single', 'available']).astype(int)

`Making some labels automatically`

In [None]:
encodeable = ['body_type', 'diet', 'drinks', 'drugs', 'smokes']

In [None]:
encodeable

In [None]:
# Replace every column listed in `encodeable` by integer codes, using a fresh
# LabelEncoder per column (the fitted encoders are not kept).
for column in encodeable:
    df[column] = LabelEncoder().fit_transform(df[column])
    

In [None]:
df_model = df[cats_and_nums]

In [None]:
df_model

`Making a function to build and use LogisticRegression model.`

In [None]:
def model_log(dataframe, feature, test_size=.3, random_state=15, max_iter=3000):
    """Fit a logistic regression predicting ``feature`` from all other columns.

    The data is split once into train and test parts, the model is fitted on
    the training part and scored on the held-out part.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Predictor columns plus the target column.
    feature : str
        Name of the target column.
    test_size : float, default 0.3
        Fraction of rows held out for testing.
    random_state : int, default 15
        Seed passed to ``train_test_split``.
    max_iter : int, default 3000
        Iteration limit passed to ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(estimator_type, predictor_names, feature, accuracy)`` where
        ``accuracy`` is the accuracy score on the test split.
    """
    X = dataframe.drop(columns=feature)
    y = dataframe[feature]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = LogisticRegression(max_iter=max_iter)
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))
    # `_estimator_type` is a private attribute; fall back to the known value
    # so the function keeps working if the attribute is absent.
    estimator_type = getattr(model, '_estimator_type', 'classifier')
    return estimator_type, list(X.columns), feature, accuracy

In [None]:
log_test1 = model_log(df_model, 'orientation')
print(log_test1)

In [None]:
log_test2 = model_log(df_model, 'sex')
print(log_test2)

In [None]:
log_test3 = model_log(df_model, 'status')
print(log_test3)

In [None]:
log_test4 = model_log(df_model, 'drinks')
print(log_test4)

In [None]:
#df.sign.unique()

In [None]:
# Set of `sign` values: each of the twelve zodiac signs on its own and combined
# with each of the three attitude suffixes, plus NaN for a missing value.
# (Despite its name this is a set, not a dict.) The HTML entity `&rsquo;` is
# kept verbatim inside the strings.
zodiac_signs = ['aries', 'taurus', 'gemini', 'cancer', 'leo', 'virgo', 'libra',
                'scorpio', 'sagittarius', 'capricorn', 'aquarius', 'pisces']
sign_suffixes = ['',
                 ' but it doesn&rsquo;t matter',
                 ' and it&rsquo;s fun to think about',
                 ' and it matters a lot']
# `np.nan` replaces the bare name `nan`, which is undefined in this notebook
# and raised NameError.
sign_dict = {sign + suffix for sign in zodiac_signs for suffix in sign_suffixes} | {np.nan}

In [None]:
df.location.unique()

In [None]:
df.sex

In [None]:
df.columns

In [None]:
# Drop the ten free-text essay columns (essay0 ... essay9).
essay_columns = [f'essay{i}' for i in range(10)]
df_new = df.drop(columns=essay_columns)

In [None]:
df_new.head(2)

In [None]:
df_new.body_type.unique(), df_new.drugs.unique(), df_new.drinks.unique()

In [None]:
features = list(df_new.columns)

In [None]:
features

In [None]:
df_new.ethnicity.unique()

In [None]:
# Remove (and display) the entry at position 6 of `features`.
# NOTE(review): presumably this is 'ethnicity', which the previous cell
# inspects — confirm against the actual column order. The list is mutated in
# place, so re-running this cell removes one more entry each time.
features.pop(6)

In [None]:
np.max(df_new.last_online.unique())

In [None]:
np.min(df_new.last_online.unique())

In [None]:
# Features that are always part of the selection. 'income' is not listed
# because that column was dropped from the dataframe earlier in the notebook;
# selecting it would raise a KeyError.
fix_features = ['age', 'height', 'sex']

In [None]:
# Pick three additional features at random from those that are not already
# fixed. `not in` is needed here: comparing a string with the whole list via
# `!=` is always True and excluded nothing. `replace=False` prevents the same
# feature from being drawn twice. The draw is unseeded, so it changes per run.
candidate_features = [feature for feature in features if feature not in fix_features]
varied_features = np.random.choice(candidate_features, 3, replace=False)

In [None]:
final_features = fix_features + list(varied_features)

In [None]:
final_features

In [None]:
pd.get_dummies(df_new[final_features], drop_first=True)

In [None]:
df.status.unique()

In [None]:
df.status.unique()

In [None]:
y = df.status