In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
data_set = pd.read_csv("/content/drive/MyDrive/Titanic/titanic.csv")

In [4]:
miss_values = data_set.isnull().sum()
miss_percent = data_set.isnull().sum() / len(data_set) * 100

print(miss_values)
print(miss_percent)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.574163
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.239234
Cabin          78.229665
Embarked        0.000000
dtype: float64


In [5]:
#İsim sutunu ve bilet numarası gibi gereksiz sutunları çıkarma
data_set = data_set.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
selected_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [6]:
#Eksik verileri doldurma
data_set['Age'].fillna(data_set['Age'].mean(), inplace=True)
data_set['Embarked'].fillna(data_set['Embarked'].mode()[0], inplace=True)
data_set['Fare'].fillna(data_set['Fare'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Age'].fillna(data_set['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Embarked'].fillna(data_set['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedi

In [7]:
#Katagorik verileri sayısal değerlere dönüştürme
label_encoder = {}
for column in ['Sex', 'Embarked']:
  le = LabelEncoder()
  data_set[column] = le.fit_transform(data_set[column])
  label_encoder[column] = le
print(data_set.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [8]:
# Girdi ve çıktıları ayırma
X = data_set[selected_columns]
y = data_set['Survived']

In [9]:
# Veriyi eğitim ve test setlerine ayırma
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)
print("Eğitim Seti İlk 5 Satır:\n", X_train[:5])
print("Test Seti İlk 5 Satır:\n", X_test[:5])

Eğitim Seti İlk 5 Satır:
      Pclass  Sex       Age  SibSp  Parch     Fare  Embarked
94        1    1  25.00000      0      0  26.0000         0
165       3    0  26.00000      1      1  22.0250         2
364       1    0  25.00000      1      0  55.4417         0
297       3    1  30.27259      2      0  21.6792         0
270       1    1  46.00000      0      0  75.2417         0
Test Seti İlk 5 Satır:
      Pclass  Sex       Age  SibSp  Parch     Fare  Embarked
146       1    1  30.27259      0      0  51.8625         2
223       3    1  21.00000      0      0   7.7958         2
193       2    1  61.00000      0      0  12.3500         1
305       1    0  64.00000      1      1  26.5500         2
188       3    0  30.27259      8      2  69.5500         2


In [10]:
# Veriyi ölçeklendirme
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit ve transform birlikte yapılıyor
X_test = scaler.transform(X_test)  # sadece transform yapılıyor
print("Eğitim Seti İlk 5 Satır:\n", X_train[:5])
print("Test Seti İlk 5 Satır:\n", X_test[:5])

Eğitim Seti İlk 5 Satır:
 [[-1.49224801  0.7361195  -0.39900493 -0.51956103 -0.43602072 -0.17165343
  -1.54816717]
 [ 0.8667097  -1.35847508 -0.31590841  0.6761934   0.92872413 -0.25214353
   0.73757021]
 [-1.49224801 -1.35847508 -0.39900493  0.6761934  -0.43602072  0.42451396
  -1.54816717]
 [ 0.8667097   0.7361195   0.03912896  1.87194783 -0.43602072 -0.25914567
  -1.54816717]
 [-1.49224801  0.7361195   1.34602191 -0.51956103 -0.43602072  0.82544578
  -1.54816717]]
Test Seti İlk 5 Satır:
 [[-1.49224801  0.7361195   0.03912896 -0.51956103 -0.43602072  0.35203845
   0.73757021]
 [ 0.8667097   0.7361195  -0.73139099 -0.51956103 -0.43602072 -0.54027177
   0.73757021]
 [-0.31276915  0.7361195   2.59246966 -0.51956103 -0.43602072 -0.4480534
  -0.40529848]
 [-1.49224801 -1.35847508  2.84175921  0.6761934   0.92872413 -0.16051644
   0.73757021]
 [ 0.8667097  -1.35847508  0.03912896  9.04647441  2.29346899  0.71019408
   0.73757021]]


In [11]:
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state = 0)
classifier = classifier.fit(X_train, y_train)

In [12]:
# Eğitim verisi doğruluğu
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print("Eğitim verisi doğruluğu: {:.2f}%".format(training_data_accuracy * 100))

Eğitim verisi doğruluğu: 100.00%


In [13]:
# Test verisi doğruluğu
X_test_prediction = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(y_test, X_test_prediction)
print("Test verisi doğruluğu: {:.2f}%".format(testing_data_accuracy * 100))

Test verisi doğruluğu: 100.00%
