In [1]:
#imports
import pandas as pd 
import numpy as np

In [2]:
#load the player records CSV into a data frame and preview the first rows
csv_path = 'player_records.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Player Number,Age,Sex,Debt,Survival
0,2,tr,Male,tr,survived
1,3,23,tr,heavy debt,survived
2,5,35,tr,debt,survived
3,6,tr,tr,debt,survived
4,7,tr,Female,heavy debt,survived


In [3]:
#data pre-processing

#every cell holding the placeholder string 'tr' becomes NaN so that
#pandas treats it as a missing value from here on
df = df.replace(to_replace='tr', value=np.nan)
df.head()

Unnamed: 0,Player Number,Age,Sex,Debt,Survival
0,2,,Male,,survived
1,3,23.0,,heavy debt,survived
2,5,35.0,,debt,survived
3,6,,,debt,survived
4,7,,Female,heavy debt,survived


In [4]:
#Age column

#convert age to float
df['Age'] = df['Age'].astype(float)

#getting youngest and oldest age (min/max skip NaN entries)
youngest = df['Age'].min()
oldest = df['Age'].max()

#replacing NaN values with randomly generated whole numbers between
#youngest and oldest (both inclusive); the generator is seeded so that
#re-running the notebook imputes the same ages
age_rng = np.random.default_rng(1)
age_missing = df['Age'].isna()
if age_missing.any():
    #integers() excludes the upper bound, hence the + 1; the bounds are
    #cast to int because min()/max() of a float column return floats
    random_ages = age_rng.integers(int(youngest), int(oldest) + 1, size=int(age_missing.sum()))
    df.loc[age_missing, 'Age'] = random_ages.astype(float)
df.head()

Unnamed: 0,Player Number,Age,Sex,Debt,Survival
0,2,24.0,Male,,survived
1,3,23.0,,heavy debt,survived
2,5,35.0,,debt,survived
3,6,24.0,,debt,survived
4,7,49.0,Female,heavy debt,survived


In [5]:
#Sex column

#replacing Male and Female with 0 and 1, then casting to float so the
#column is numeric before the gaps are filled.
#The result is assigned back to the column: calling replace/fillna with
#inplace=True on df['Sex'] acts on a column selection and does not
#reliably update df when pandas copy-on-write is active.
sex_encoded = df['Sex'].replace({'Male': 0, 'Female': 1}).astype(float)

#forward filling; the extra backward fill covers missing values at the
#very start of the column, which a forward fill alone leaves as NaN
df['Sex'] = sex_encoded.ffill().bfill()

df.head()

Unnamed: 0,Player Number,Age,Sex,Debt,Survival
0,2,24.0,0.0,,survived
1,3,23.0,0.0,heavy debt,survived
2,5,35.0,0.0,debt,survived
3,6,24.0,0.0,debt,survived
4,7,49.0,1.0,heavy debt,survived


In [6]:
#Debt column

#replacing NaN in debt: every missing entry gets its own random category
#(a single draw would give the same label to all missing rows); the
#generator is seeded so the fill is reproducible
debtArray = ['debt', 'heavy debt']
debt_rng = np.random.default_rng(1)
debt_missing = df['Debt'].isna()
if debt_missing.any():
    #one index into debtArray per missing row
    picks = debt_rng.integers(0, len(debtArray), size=int(debt_missing.sum()))
    df.loc[debt_missing, 'Debt'] = [debtArray[i] for i in picks]

# Define a function to replace 'debt' and 'heavy debt' with random values
def replace_debt(row):
    """Return a random integer amount for one debt category.

    The value 'debt' is drawn from 100000000..700000000 inclusive; any
    other value (including 'heavy debt') is drawn from
    700000001..1499999999 inclusive. np.random.randint excludes its
    upper bound, which is why the bounds below are one higher.
    """
    low, high = (100000000, 700000001) if row == 'debt' else (700000001, 1500000000)
    return np.random.randint(low, high)


# Swap each debt category in the 'Debt' column for a random amount,
# calling replace_debt once per row in order
df['Debt'] = df['Debt'].map(replace_debt)

df.head()

Unnamed: 0,Player Number,Age,Sex,Debt,Survival
0,2,24.0,0.0,514998526,survived
1,3,23.0,0.0,1386364609,survived
2,5,35.0,0.0,497943410,survived
3,6,24.0,0.0,577589729,survived
4,7,49.0,1.0,748008850,survived


In [7]:
# Enumerating survival column: survived -> 0, died -> 1

#The encoded result is assigned back to the column: calling
#replace(..., inplace=True) on df['Survival'] acts on a column selection
#and does not reliably update df when pandas copy-on-write is active.
#to_numeric gives the column a numeric dtype.
df['Survival'] = pd.to_numeric(df['Survival'].replace({'survived': 0, 'died': 1}))
df.head()

Unnamed: 0,Player Number,Age,Sex,Debt,Survival
0,2,24.0,0.0,514998526,0
1,3,23.0,0.0,1386364609,0
2,5,35.0,0.0,497943410,0
3,6,24.0,0.0,577589729,0
4,7,49.0,1.0,748008850,0


In [8]:
#Splitting training and testing sets
from sklearn.model_selection import train_test_split

#target: the 'Survival' column
df_output = df['Survival']

#features: everything except the 'Survival' and 'Player Number' columns
df_inputs = df.drop(['Survival', 'Player Number'], axis=1)

#not rendered by Jupyter, since it is not the last expression of the cell
df_inputs.head()

#70:30 train/test split with a fixed random_state so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    df_inputs,
    df_output,
    test_size=0.3,
    random_state=1,
)

In [9]:
#Fit a linear regression model on the training split
from sklearn.linear_model import LinearRegression

#fit() returns the fitted estimator, so construction and fitting are chained
lr = LinearRegression().fit(x_train, y_train)

#one coefficient per input column, followed by the intercept
print('Corefficient: ', lr.coef_)
print('Intercept', lr.intercept_)

Corefficient:  [ 8.23194962e-03 -2.78775198e-02 -3.05520637e-10]
Intercept 0.15638561269784365


In [10]:
#predict on the held-out test split and report the mean squared error

from sklearn import metrics

y_pred = lr.predict(x_test)

mse = metrics.mean_squared_error(y_test, y_pred)
print(mse)

0.12675909386738454


In [11]:
#exploring the results
#the plotting functions live in matplotlib.pyplot; the top-level
#matplotlib package has no scatter/plot/show
import matplotlib.pyplot as plt

#x_test holds several feature columns, so it cannot be scattered against
#y_test in a single call; draw one panel per feature instead
feature_names = list(x_test.columns)
fig, axes = plt.subplots(
    1,
    len(feature_names),
    figsize=(5 * len(feature_names), 4),
    sharey=True,
    squeeze=False,  #always a 2-D array of axes, even with a single feature
)
for ax, feature in zip(axes[0], feature_names):
    #actual outcomes in blue, model predictions in black
    ax.scatter(x_test[feature], y_test, color='b', label='actual')
    ax.scatter(x_test[feature], y_pred, color='k', marker='x', label='predicted')
    ax.set_xlabel(feature)
axes[0][0].set_ylabel('Survival')
axes[0][0].legend()
fig.suptitle('Actual vs predicted Survival on the test split')
fig.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
#Predicting if I would die in squid game 