### IMPORT 

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression   
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

### VERİYİ YÜKLEME 

In [6]:
df = pd.read_csv('C:\\Users\\merve\\Desktop\\staj_lotus_ai\\EtSut_ML_Projects\\03_regression_milk_quality\\data\\milknew.csv')

### KEŞİFÇİ VERİ ANALİZİ

In [7]:
def check_df(dataframe):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(3))
    print("##################### Tail #####################")
    print(dataframe.tail(3))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    #print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

In [8]:
check_df(df)

##################### Shape #####################
(1059, 8)
##################### Types #####################
pH            float64
Temprature      int64
Taste           int64
Odor            int64
Fat             int64
Turbidity       int64
Colour          int64
Grade          object
dtype: object
##################### Head #####################
    pH  Temprature  Taste  Odor  Fat   Turbidity  Colour Grade
0  6.6          35      1     0     1          0     254  high
1  6.6          36      0     1     0          1     253  high
2  8.5          70      1     1     1          1     246   low
##################### Tail #####################
       pH  Temprature  Taste  Odor  Fat   Turbidity  Colour Grade
1056  3.0          40      1     1     1          1     255   low
1057  6.8          43      1     0     1          0     250  high
1058  8.6          55      0     1     1          1     255   low
##################### NA #####################
pH            0
Temprature    0
Taste  

In [10]:
# 2. Kategorik Hedefi Sayısal Skora Dönüştürme (Regresyon için) [cite: 12]
# Low: 0, Medium: 1, High: 2
target_mapping = {'low': 0, 'medium': 1, 'high': 2}
df['Grade_num'] = df['Grade'].map(target_mapping)

In [11]:
df.head()

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour,Grade,Grade_num
0,6.6,35,1,0,1,0,254,high,2
1,6.6,36,0,1,0,1,253,high,2
2,8.5,70,1,1,1,1,246,low,0
3,9.5,34,1,1,0,1,255,low,0
4,6.6,37,0,0,0,0,255,medium,1


In [14]:
def grab_col_names(dataframe, cat_th=10, car_th=25):
    """
    grab_col_names for given dataframe

    :param dataframe:
    :param cat_th:
    :param car_th:
    :return:
    """

    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]

    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]

    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]

    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, cat_but_car, num_cols

In [15]:
cat_cols, cat_but_car, num_cols = grab_col_names(df)    

Observations: 1059
Variables: 9
cat_cols: 7
num_cols: 2
cat_but_car: 0
num_but_cat: 6


### AYKIRI DEĞERLER 

In [12]:
def outlier_thresholds(dataframe, variable, low_quantile=0.10, up_quantile=0.90):
    quantile_one = dataframe[variable].quantile(low_quantile)
    quantile_three = dataframe[variable].quantile(up_quantile)
    interquantile_range = quantile_three - quantile_one
    up_limit = quantile_three + 1.5 * interquantile_range
    low_limit = quantile_one - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [13]:
for col in df.columns:
    if col != 'Grade' and col != 'Grade_num':
        print(col, check_outlier(df, col))
      
      

pH False
Temprature True
Taste False
Odor False
Fat  False
Turbidity False
Colour False


In [16]:
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit


for col in num_cols:
        replace_with_thresholds(df,col)

  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit


In [17]:
for col in num_cols:
        print(col, check_outlier(df, col))

pH False
Temprature False


In [18]:
print(f"Yeni veri seti boyutu: {df.shape}")

Yeni veri seti boyutu: (1059, 9)


### TRAIN TEST SPLIT

In [19]:
X = df.drop(['Grade', 'Grade_num'], axis=1)
y = df['Grade_num']

# 4. Veri Bölümleme (%75 Eğitim, %25 Test - KNIME ile aynı oran) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Normalizasyon

In [20]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)

### MODEL

In [22]:
# Model 1: Linear Regression [cite: 12]
lr_model = LinearRegression()
lr_model.fit(X_train_std, y_train)

# Model 2: Random Forest + Parametre Optimizasyonu (AutoML) 
rf = RandomForestRegressor(random_state=42)

param_dist = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# En uygun algoritma parametrelerinin seçilmesi 
auto_ml_rf = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=10, cv=5, scoring='r2')
auto_ml_rf.fit(X_train_std, y_train)

### DEĞERLENDİRME

In [23]:
from sklearn.metrics import mean_squared_error, r2_score

# Tahminler
lr_preds = lr_model.predict(X_test_std)
rf_preds = auto_ml_rf.predict(X_test_std)

# Sonuçların Karşılaştırılması 
results = {
    "Linear Regression": {
        "R2": r2_score(y_test, lr_preds),
        "RMSE": np.sqrt(mean_squared_error(y_test, lr_preds))
    },
    "Random Forest (Optimized)": {
        "R2": r2_score(y_test, rf_preds),
        "RMSE": np.sqrt(mean_squared_error(y_test, rf_preds))
    }
}

print(pd.DataFrame(results).T)

                                 R2      RMSE
Linear Regression          0.230434  0.675462
Random Forest (Optimized)  0.973943  0.124291
