`Import the modules`

In [862]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

`Read the data from the CSV file into a pandas DataFrame`

In [863]:
# Load the profiles dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('profiles.csv')

`Get some info about the dataframe`

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Preview the first row of the frame.
df.head(1)

`Fill the "nan" values of the height column with the average value of the series, then multiply the column by 2.54 (the inch-to-centimetre factor)`

In [None]:
# Impute missing heights with the column mean, then scale every value by 2.54
# (the inch-to-centimetre factor).
# The column is overwritten, so running this cell again scales it a second time.
df['height'] = df['height'].fillna(df['height'].mean()).mul(2.54)

`Take a look at the "nan" values of the whole dataframe`

In [None]:
# Count missing values per column, sorted from fewest to most.
df_isna = df.isna().sum().sort_values()
# Display is currently disabled; uncomment the next line to show the counts.
#df_isna

`Choose some categorical and numerical variables and put them into a list`

In [None]:
# Columns explored in the plots below. Order matters: a later cell slices off
# the last four entries to obtain the model's predictors.
cats_and_nums = [
    'age',
    'sex',
    'orientation',
    'income',
    'height',
    'status',
    'drinks',
    'body_type',
    'smokes',
]

`Making pie charts, countplots and histograms of the features from the list`

In [None]:
# Draw one figure per selected feature:
#   * object column with fewer than 6 categories -> donut-style pie chart
#   * object column with 6 or more categories    -> count plot
#   * int64 / float64 column                     -> histogram with mean and median markers
for feature in cats_and_nums:
    fig, ax = plt.subplots(figsize=(10, 7))
    data = df[feature]
    title = f'Distribution of {feature} of {len(data)} participants'
    if data.dtype == 'object':
        # value_counts() drops NaN and sorts by frequency. Using the same object
        # for the wedges and the legend guarantees every label matches its wedge
        # (data.unique() is in order of appearance and may include NaN).
        counts = data.value_counts()
        if len(counts) < 6:
            wedges, *_ = ax.pie(
                counts,
                autopct='%d%%',
                pctdistance=.85,
                explode=[0.05] * len(counts),
                colors=sns.color_palette('Set1'),
            )
            ax.set_title(title)
            ax.legend(wedges, counts.index.tolist())
            # A filled circle over the centre turns the pie into a donut.
            ax.add_artist(plt.Circle((0, 0), 0.7, facecolor='lightblue'))
        else:
            sns.countplot(x=data, saturation=0.75, palette='Set1', ax=ax)
            ax.set_title(f'Distribution of {feature}')
            ax.tick_params(axis='x', labelrotation=15, labelsize=9)
    elif data.dtype in ['int64', 'float64']:
        sns.histplot(data, bins=30, color='darkgreen', ax=ax)
        ax.set_title(title)
        # Series.mean()/median() skip NaN, whereas np.median would return NaN.
        mean, median = np.round(data.mean()), np.round(data.median())
        # Extend the markers to the current top of the axes instead of a fixed count.
        top = ax.get_ylim()[1]
        ax.vlines([mean, median], ymin=0, ymax=top, colors=['red', 'blue'])
        ax.text(x=mean * 1.1, y=top, s=f'Mean: {mean}')
        ax.text(x=median * .8, y=top, s='Median')
    plt.show()
    # Close the figure that was just shown; plt.clf() here would instead create
    # (and leave open) a new empty figure.
    plt.close(fig)

In [None]:
# Treat a missing smoking answer as its own 'unknown' category.
df['smokes'] = df['smokes'].fillna('unknown')

`Change the dtypes of "sex, orientation and status" categorical columns from object to int`

In [None]:
# Encode the three categorical columns as integers.
# Explicit mappings followed by astype(int) give a real int dtype without
# depending on replace() implicitly downcasting object columns, and keep the
# cell safe to re-run: values that are already encoded pass through unchanged.
# astype(int) raises if a value outside the mapping (including NaN) is present,
# rather than silently coding it.
df['sex'] = df['sex'].replace({'m': 1, 'f': 0}).astype(int)
df['orientation'] = df['orientation'].replace({'straight': 2, 'bisexual': 1, 'gay': 0}).astype(int)
# Binary target: 1 = single / available / seeing someone, 0 = married / unknown.
df['status'] = df['status'].replace(
    {'single': 1, 'available': 1, 'seeing someone': 1, 'married': 0, 'unknown': 0}
).astype(int)


`Make the predictor and the outcome variables.`

`X is equal to a pandas dataframe with the chosen features and y is equal to a pandas series with status`

In [None]:
# Predictors: every entry of cats_and_nums except the last four
# (status, drinks, body_type, smokes). Target: the status column.
predictor_columns = cats_and_nums[:-4]
X = df[predictor_columns]
y = df['status']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=15,
)

In [None]:
# Logistic-regression baseline. The predictors are fed in unscaled, so the
# iteration cap is raised above the library default (100) to give the solver
# room to converge; scaling the features first would be the alternative.
model1 = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training split.
model1.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred = model1.predict(X_test)

In [None]:
# Test-set accuracy obtained two ways (metric function and the estimator's own
# score method), displayed together as a tuple.
accuracy_score(y_test, y_pred), model1.score(X_test, y_test)

In [None]:
#df.sign.unique()

In [None]:
# Raw values of the `sign` column, kept as a set (despite the name) of strings
# plus the missing-value marker. The HTML entity `&rsquo;` is part of the
# stored text and is preserved as-is.
# The missing value is written as np.nan: the bare name `nan` is not defined
# in this notebook and would raise NameError.
sign_dict = {'gemini', 'cancer', 'pisces but it doesn&rsquo;t matter', 'pisces',
       'aquarius', 'taurus', 'virgo', 'sagittarius',
       'gemini but it doesn&rsquo;t matter',
       'cancer but it doesn&rsquo;t matter',
       'leo but it doesn&rsquo;t matter', np.nan,
       'aquarius but it doesn&rsquo;t matter',
       'aries and it&rsquo;s fun to think about',
       'libra but it doesn&rsquo;t matter',
       'pisces and it&rsquo;s fun to think about', 'libra',
       'taurus but it doesn&rsquo;t matter',
       'sagittarius but it doesn&rsquo;t matter',
       'scorpio and it matters a lot',
       'gemini and it&rsquo;s fun to think about',
       'leo and it&rsquo;s fun to think about',
       'cancer and it&rsquo;s fun to think about',
       'libra and it&rsquo;s fun to think about',
       'aquarius and it&rsquo;s fun to think about',
       'virgo but it doesn&rsquo;t matter',
       'scorpio and it&rsquo;s fun to think about',
       'capricorn but it doesn&rsquo;t matter', 'scorpio',
       'capricorn and it&rsquo;s fun to think about', 'leo',
       'aries but it doesn&rsquo;t matter', 'aries',
       'scorpio but it doesn&rsquo;t matter',
       'sagittarius and it&rsquo;s fun to think about',
       'libra and it matters a lot',
       'taurus and it&rsquo;s fun to think about',
       'leo and it matters a lot',
       'virgo and it&rsquo;s fun to think about',
       'cancer and it matters a lot', 'capricorn',
       'pisces and it matters a lot', 'aries and it matters a lot',
       'capricorn and it matters a lot', 'aquarius and it matters a lot',
       'sagittarius and it matters a lot', 'gemini and it matters a lot',
       'taurus and it matters a lot', 'virgo and it matters a lot'}

In [None]:
# Distinct values of the location column.
df.location.unique()

In [None]:
# Show the sex column (already encoded as 0/1 by an earlier cell).
df.sex

In [None]:
# List all column names of the full frame.
df.columns

In [None]:
# Work on a copy of the frame without the ten free-text essay columns
# (essay0 ... essay9).
essay_columns = [f'essay{i}' for i in range(10)]
df_new = df.drop(columns=essay_columns)

In [None]:
# Preview the first two rows of the trimmed frame.
df_new.head(2)

In [None]:
# Distinct values of body_type, drugs and drinks, shown as a tuple of arrays.
df_new.body_type.unique(), df_new.drugs.unique(), df_new.drinks.unique()

In [None]:
# Candidate feature names: every column of the trimmed frame, as a plain list.
features = df_new.columns.tolist()

In [None]:
# Show the current candidate feature list.
features

In [None]:
# Distinct values of the ethnicity column.
df_new.ethnicity.unique()

In [None]:
# Remove the entry at position 6 from the feature list; the removed name is
# echoed as the cell output.
# NOTE(review): presumably this is 'ethnicity', which was inspected just
# before — confirm against df_new.columns.
# pop() mutates the list in place, so re-running this cell removes a further entry.
features.pop(6)

In [None]:
# Largest distinct value of last_online
# (presumably timestamp strings, so the comparison would be lexicographic — TODO confirm).
np.max(df_new.last_online.unique())

In [None]:
# Smallest distinct value of last_online
# (presumably timestamp strings, so the comparison would be lexicographic — TODO confirm).
np.min(df_new.last_online.unique())

In [None]:
# Features that are always part of the final feature set.
fix_features = [
    'age',
    'height',
    'income',
    'sex',
]

In [None]:
# Randomly pick three extra candidate features on top of the fixed ones.
# `not in` is what actually excludes the fixed features: comparing each name
# with `!=` against the whole list is always True and filters nothing.
# replace=False prevents the same column from being drawn more than once.
# NOTE(review): the draw is unseeded, so the selection changes on every run,
# and `features` may still contain the target column `status` — confirm both
# are intended.
varied_features = np.random.choice(
    [feature for feature in features if feature not in fix_features],
    3,
    replace=False,
)

In [None]:
# Final feature set: the fixed features followed by the randomly drawn ones.
final_features = [*fix_features, *varied_features]

In [None]:
# Show the assembled feature list.
final_features

In [None]:
# One-hot encode the selected columns; drop_first=True omits the first level of
# each encoded column. The result is only displayed here, not stored.
pd.get_dummies(df_new[final_features], drop_first=True)

In [None]:
# Distinct values of the status column (encoded as 0/1 by an earlier cell).
df.status.unique()

In [None]:
# Same check as the previous cell: distinct values of the status column.
df.status.unique()

In [None]:
# Target series: the status column (the same assignment was made before the first model).
y = df.status