## Verilerin istatistiksel olarak incelenmesi ve aykırı değerlerin ayıklanması

Gerekli paket ve modüllerin yüklenmesi

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Verilerin yüklenmesi

In [None]:
# Load the dataset from data.csv (relative path, resolved against the working directory).
df = pd.read_csv('data.csv')

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6858 entries, 0 to 6857
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          6858 non-null   object
 1   district      6858 non-null   object
 2   neighborhood  6858 non-null   object
 3   room          6858 non-null   int64 
 4   living_room   6858 non-null   int64 
 5   area          6858 non-null   int64 
 6   age           6858 non-null   int64 
 7   floor         6858 non-null   int64 
 8   price         6858 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 482.3+ KB
None


In [None]:
# Cast the three location columns to the categorical dtype and the six
# numeric columns to int, using a single astype call with a column -> dtype map.
category_columns = ['city', 'district', 'neighborhood']
integer_columns = ['room', 'living_room', 'area', 'age', 'floor', 'price']
df = df.astype({
    **{name: 'category' for name in category_columns},
    **{name: 'int' for name in integer_columns},
})

In [None]:
# Show the dtypes after the casts. info() prints on its own and returns None,
# so it is called directly instead of being passed to print().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6858 entries, 0 to 6857
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6858 non-null   category
 1   district      6858 non-null   category
 2   neighborhood  6858 non-null   category
 3   room          6858 non-null   int64   
 4   living_room   6858 non-null   int64   
 5   area          6858 non-null   int64   
 6   age           6858 non-null   int64   
 7   floor         6858 non-null   int64   
 8   price         6858 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 373.0 KB
None


Nümerik değişkenler için çeyreklikler arası aralığa (IQR) dayalı alt ve üst aykırı değer sınırlarının bulunması

In [8]:
# For every numeric column, derive the lower and upper outlier fences:
#   lower = Q1 - 1.5 * IQR,  upper = Q3 + 1.5 * IQR,  with IQR = Q3 - Q1.
# The fences are collected in min_values / max_values, in the same order as
# `columns`, so that a later cell can look them up by position.
columns = df.select_dtypes(include=[np.number]).columns
min_values, max_values = [], []
for column in columns:
    lower_quartile, upper_quartile = (df[column].quantile(q) for q in (0.25, 0.75))
    spread = upper_quartile - lower_quartile
    min_value = lower_quartile - 1.5 * spread
    max_value = upper_quartile + 1.5 * spread
    min_values.append(min_value)
    max_values.append(max_value)
    print(f"Column: {column}, min: {min_value}, max: {max_value}")

Column: room, min: 0.5, max: 4.5
Column: living_room, min: 1.0, max: 1.0
Column: area, min: -25.0, max: 231.0
Column: age, min: -20.0, max: 44.0
Column: floor, min: -2.0, max: 6.0
Column: price, min: -8750.0, max: 45250.0


Aykırı değerlerin temizlenmesi

In [11]:
# Keep only the rows that lie inside the fences of every numeric column.
# Series.between is inclusive on both ends by default, i.e. low <= value <= high.
# The filters are applied one column after another, each on the already reduced frame.
for column, low, high in zip(columns, min_values, max_values):
    df = df[df[column].between(low, high)]

In [12]:
# Row count and dtypes after outlier removal. info() prints its own report and
# returns None, so print() would only add a spurious "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5644 entries, 0 to 6857
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          5644 non-null   category
 1   district      5644 non-null   category
 2   neighborhood  5644 non-null   category
 3   room          5644 non-null   int64   
 4   living_room   5644 non-null   int64   
 5   area          5644 non-null   int64   
 6   age           5644 non-null   int64   
 7   floor         5644 non-null   int64   
 8   price         5644 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 355.3 KB
None


In [13]:
# Summary statistics after outlier removal. Leaving the frame as the cell's last
# expression lets Jupyter render it as a table instead of line-wrapped plain text.
df.describe()

              room  living_room         area          age        floor  \
count  5644.000000       5644.0  5644.000000  5644.000000  5644.000000   
mean      2.131113          1.0   101.877038    12.733345     2.233345   
std       0.812033          0.0    37.768240    10.560843     1.643361   
min       1.000000          1.0     5.000000     0.000000    -2.000000   
25%       2.000000          1.0    70.000000     4.000000     1.000000   
50%       2.000000          1.0   100.000000    10.000000     2.000000   
75%       3.000000          1.0   130.000000    20.000000     3.000000   
max       4.000000          1.0   230.000000    44.000000     6.000000   

              price  
count   5644.000000  
mean   16275.806166  
std     7928.146837  
min        1.000000  
25%    11000.000000  
50%    15000.000000  
75%    20000.000000  
max    45000.000000  


Kira fiyatı için elle düzeltme: fiyatı 3000'in altında olan kayıtların çıkarılması

In [14]:
# Manual lower limit on price: keep only the rows whose price is at least 3000.
price_at_or_above_floor = df['price'] >= 3000
df = df[price_at_or_above_floor]

In [16]:
# Write the filtered frame to data_cleaned.csv; index=False keeps the row index out of the file.
df.to_csv('data_cleaned.csv', index=False)

In [17]:
# Final structure check. info() prints directly and returns None, so it is not
# wrapped in print() (that only produced an extra "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6116 entries, 23 to 8134
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6116 non-null   category
 1   district      6116 non-null   category
 2   neighborhood  6116 non-null   category
 3   room          6116 non-null   int64   
 4   living_room   6116 non-null   int64   
 5   area          6116 non-null   int64   
 6   age           6116 non-null   int64   
 7   floor         6116 non-null   int64   
 8   price         6116 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 383.3 KB
None


In [19]:
# Final summary statistics, left as the cell's last expression so Jupyter shows
# the rich table rendering rather than the wrapped plain-text repr.
df.describe()

              room  living_room         area          age        floor  \
count  6116.000000       6116.0  6116.000000  6116.000000  6116.000000   
mean      2.180020          1.0   104.830445    12.698169     2.198496   
std       0.826463          0.0    39.467687    10.465384     1.589161   
min       1.000000          1.0     5.000000     0.000000    -2.000000   
25%       2.000000          1.0    75.000000     4.000000     1.000000   
50%       2.000000          1.0   100.000000    10.000000     2.000000   
75%       3.000000          1.0   130.000000    20.000000     3.000000   
max       4.000000          1.0   240.000000    44.000000     6.000000   

              price  
count   6116.000000  
mean   18170.733976  
std    10323.229150  
min     3000.000000  
25%    11500.000000  
50%    15000.000000  
75%    21000.000000  
max    60000.000000  


In [20]:
# Writes the frame to data_cleaned.csv again, overwriting the file produced by the earlier export cell.
df.to_csv('data_cleaned.csv', index=False)