In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import joblib

df = pd.read_csv('emlak_veri_temiz_outliers_removed.csv', encoding='utf-8')

In [34]:
df.isnull().sum()

Price (TL)         0
Room               0
Size (m²)          0
Building Age    1211
IL                 0
ILCE               0
Neighborhood       0
dtype: int64

In [35]:
df_filled = df.copy()


ilce_mah_mean = df[df['Building Age'].notna()].groupby(['ILCE', 'Neighborhood'])['Building Age'].mean()
ilce_mean = df[df['Building Age'].notna()].groupby('ILCE')['Building Age'].mean()
general_mean = df['Building Age'].mean()


for idx, row in df[df['Building Age'].isna()].iterrows():
    try:
        df_filled.loc[idx, 'Building Age'] = ilce_mah_mean.loc[(row['ILCE'], row['Neighborhood'])]
    except:
        try:
            df_filled.loc[idx, 'Building Age'] = ilce_mean.loc[row['ILCE']]
        except:
            df_filled.loc[idx, 'Building Age'] = general_mean

def convert_room(room_str):
    try:
        if '+' in str(room_str):
            parts = str(room_str).split('+')
            return sum(int(part) for part in parts if part.isdigit())
        elif str(room_str).lower() == 'stüdyo':
            return 1
        else:
            return int(room_str)
    except:
        return np.nan

df_filled['Room_Numeric'] = df_filled['Room'].apply(convert_room)

if df_filled['Room_Numeric'].isnull().sum() > 0:
    df_filled['Room_Numeric'] = df_filled['Room_Numeric'].fillna(df_filled['Room_Numeric'].median())


print(f"Doldurma sonrası Building Age eksik değer sayısı: {df_filled['Building Age'].isnull().sum()}")
print(f"Room_Numeric eksik değer sayısı: {df_filled['Room_Numeric'].isnull().sum()}")


X = df_filled.drop('Price (TL)', axis=1)
y = df_filled['Price (TL)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_features = ['Size (m²)', 'Building Age', 'Room_Numeric']
categorical_features = ['ILCE', 'Neighborhood']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(
        n_estimators=200,
        learning_rate=0.01,
        max_depth=7,
        colsample_bytree=0.9,
        random_state=42
    ))
])

xgb_pipeline.fit(X_train, y_train)

y_pred = xgb_pipeline.predict(X_test)


rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nXGBoost Model Performansı:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.4f}")


def predict_price(ilce, mahalle, oda_sayisi, metrekare, bina_yasi):
    input_data = pd.DataFrame({
        'Room': [oda_sayisi],
        'Size (m²)': [metrekare],
        'Building Age': [bina_yasi],
        'IL': ['Ankara'],
        'ILCE': [ilce],
        'Neighborhood': [mahalle],
        'Room_Numeric': [convert_room(oda_sayisi)]
    })

    prediction = xgb_pipeline.predict(input_data)[0]
    return prediction

ornek_tahmin = predict_price('Çankaya', 'Bahçelievler', '3+1', 120, 15)
print(f"\nÖrnek tahmin: {ornek_tahmin:.2f} TL")

joblib.dump(xgb_pipeline, 'emlak_xgboost_model.pkl')


print("\nModel ve veri seti kaydedildi.")

Doldurma sonrası Building Age eksik değer sayısı: 0
Room_Numeric eksik değer sayısı: 0

XGBoost Model Performansı:
RMSE: 7666.32
MAE: 5367.38
R²: 0.5590

Örnek tahmin: 28048.31 TL

Model ve veri seti kaydedildi.


In [36]:
df_filled=df_filled.rename(columns={"IL":"City","ILCE":"District"})

In [37]:
df_filled.head()

Unnamed: 0,Price (TL),Room,Size (m²),Building Age,City,District,Neighborhood,Room_Numeric
0,26000,3+1,120,17.0,Ankara,Altındağ,Aydınlıkevler Mah.,4
1,32000,3+1,135,47.0,Ankara,Çankaya,Çankaya Mah.,4
2,42000,3+1,135,35.0,Ankara,Çankaya,Kavaklıdere Mah.,4
3,55000,4+1,240,5.0,Ankara,Çankaya,Büyükesat Mah.,5
4,30000,2+1,95,21.0,Ankara,Çankaya,İlkadım Mah.,3


In [38]:
df_filled.to_csv('emlak-veri-egitime-hazir.csv', index=False)