In [74]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
import glob
import os

In [75]:
input_files = glob.glob("./data/apartments_rent_*.csv")
print(input_files)

['./data\\apartments_rent_pl_2023_11.csv', './data\\apartments_rent_pl_2023_12.csv', './data\\apartments_rent_pl_2024_01.csv', './data\\apartments_rent_pl_2024_02.csv', './data\\apartments_rent_pl_2024_03.csv', './data\\apartments_rent_pl_2024_04.csv']


In [76]:
# Function to filter rows with 'city' column equals to 'Krakow'
def filter_city(df):
    return df[df['city'] == 'krakow']

# Function to read CSV file, filter rows, and return filtered DataFrame
def process_csv(input_file):
    # Read CSV file into a DataFrame
    df = pd.read_csv(input_file)
    
    # Filter rows with 'city' column equals to 'Krakow'
    return filter_city(df)

# List to store filtered DataFrames
filtered_dfs = []

# Process each input CSV file and append filtered DataFrame to list
for input_file in input_files:
    filtered_dfs.append(process_csv(input_file))

# Concatenate all filtered DataFrames
result_df = pd.concat(filtered_dfs, ignore_index=True)

# Output CSV file
output_file = './data/total-krakow-data.csv'

# Write concatenated DataFrame to output CSV file
result_df.to_csv(output_file, index=False)

In [77]:
krakow_data = pd.read_csv("./data/total-krakow-data.csv")
X = krakow_data.drop(columns=["id","city","type","floor","floorCount","buildYear","latitude","longitude","poiCount","schoolDistance","clinicDistance","postOfficeDistance","kindergartenDistance","restaurantDistance","collegeDistance","pharmacyDistance","ownership","buildingMaterial","condition","hasParkingSpace","hasBalcony","hasElevator","hasSecurity","hasStorageRoom", "price"])
y = krakow_data["price"]

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2)
train_model = DecisionTreeClassifier()
train_model.fit(X_train, y_train)

In [79]:
train_prediction = train_model.predict(X_test)
train_prediction

array([1800, 2400, 1990, ..., 3100, 2600, 2400], dtype=int64)

In [80]:
score = accuracy_score(y_test, train_prediction)
score

0.4656781987918726

In [81]:
model = DecisionTreeClassifier()
model.fit(X.values, y) # Expecting squareMeters, rooms, centreDistance.

In [82]:
joblib.dump(model, 'model.joblib')

['model.joblib']