# Tenis
#### Table of Contents
1. [Importing Libraries and Data](#Importing-Libraries-and-Data)
2. [Data Preprocessing](#Data-Preprocessing)
   1. [Encoding Non-Numeric Values](#Encoding-Non-Numeric-Values)
   2. [Test Train Split](#Test-Train-Split)
   3. [Scaling Values](#Scaling-Values)
3. [Training the Model](#Training-the-Model)
4. [Using P-Values to Improve the Model](#Using-P-Values-to-Improve-the-Model)
   1. [Backwards Elimination](#Backwards-Elimination)

### Importing Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
raw_import = pd.read_csv("ödev.csv")
raw_import

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
3,rainy,70,96,False,yes
4,rainy,68,80,False,yes
5,rainy,65,70,True,no
6,overcast,64,65,True,yes
7,sunny,72,95,False,no
8,sunny,69,70,False,yes
9,rainy,75,80,False,yes


In [4]:
raw_import.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   outlook      14 non-null     object
 1   temperature  14 non-null     int64 
 2   humidity     14 non-null     int64 
 3   windy        14 non-null     bool  
 4   play         14 non-null     object
dtypes: bool(1), int64(2), object(2)
memory usage: 594.0+ bytes


In [5]:
raw_import[["outlook","play"]].nunique()

outlook    3
play       2
dtype: int64

### Data Preprocessing

#### Encoding Non-Numeric Values

In [6]:
raw_import.iloc[:,[0,3,4]]

Unnamed: 0,outlook,windy,play
0,sunny,False,no
1,sunny,True,no
2,overcast,False,yes
3,rainy,False,yes
4,rainy,False,yes
5,rainy,True,no
6,overcast,True,yes
7,sunny,False,no
8,sunny,False,yes
9,rainy,False,yes


In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
ohe = preprocessing.OneHotEncoder()

In [8]:
quicker_way = raw_import.iloc[:,[0,3,4]].apply(preprocessing.LabelEncoder().fit_transform)
quicker_way

Unnamed: 0,outlook,windy,play
0,2,0,0
1,2,1,0
2,0,0,1
3,1,0,1
4,1,0,1
5,1,1,0
6,0,1,1
7,2,0,0
8,2,0,1
9,1,0,1


In [9]:
numerical_columns = raw_import[["temperature","humidity"]]
numerical_columns

Unnamed: 0,temperature,humidity
0,85,85
1,80,90
2,83,86
3,70,96
4,68,80
5,65,70
6,64,65
7,72,95
8,69,70
9,75,80


In [10]:
raw_import["windy"] = le.fit_transform(raw_import["windy"].values)
raw_import["play"] = le.fit_transform(raw_import["play"].values)

In [11]:
raw_import

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,0,0
1,sunny,80,90,1,0
2,overcast,83,86,0,1
3,rainy,70,96,0,1
4,rainy,68,80,0,1
5,rainy,65,70,1,0
6,overcast,64,65,1,1
7,sunny,72,95,0,0
8,sunny,69,70,0,1
9,rainy,75,80,0,1


In [12]:
outlook_column = raw_import.iloc[:,0:1].values
outlook_column

array([['sunny'],
       ['sunny'],
       ['overcast'],
       ['rainy'],
       ['rainy'],
       ['rainy'],
       ['overcast'],
       ['sunny'],
       ['sunny'],
       ['rainy'],
       ['sunny'],
       ['overcast'],
       ['overcast'],
       ['rainy']], dtype=object)

In [13]:
outlook_column[:,0] = le.fit_transform(raw_import.iloc[:,0])
outlook_column

array([[2],
       [2],
       [0],
       [1],
       [1],
       [1],
       [0],
       [2],
       [2],
       [1],
       [2],
       [0],
       [0],
       [1]], dtype=object)

In [14]:
outlook_column = ohe.fit_transform(outlook_column).toarray()
outlook_column

array([[0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [15]:
outlookDF = pd.DataFrame(data=outlook_column, index=range(len(outlook_column)), columns=["overcast","rainy","sunny"])
raw_import.drop(columns=["outlook"],inplace=True)
valuesDF = pd.concat([outlookDF, raw_import], axis=1)
valuesDF

Unnamed: 0,overcast,rainy,sunny,temperature,humidity,windy,play
0,0.0,0.0,1.0,85,85,0,0
1,0.0,0.0,1.0,80,90,1,0
2,1.0,0.0,0.0,83,86,0,1
3,0.0,1.0,0.0,70,96,0,1
4,0.0,1.0,0.0,68,80,0,1
5,0.0,1.0,0.0,65,70,1,0
6,1.0,0.0,0.0,64,65,1,1
7,0.0,0.0,1.0,72,95,0,0
8,0.0,0.0,1.0,69,70,0,1
9,0.0,1.0,0.0,75,80,0,1


#### Test Train Split

In [16]:
x = valuesDF.iloc[:,0:-1]
x

Unnamed: 0,overcast,rainy,sunny,temperature,humidity,windy
0,0.0,0.0,1.0,85,85,0
1,0.0,0.0,1.0,80,90,1
2,1.0,0.0,0.0,83,86,0
3,0.0,1.0,0.0,70,96,0
4,0.0,1.0,0.0,68,80,0
5,0.0,1.0,0.0,65,70,1
6,1.0,0.0,0.0,64,65,1
7,0.0,0.0,1.0,72,95,0
8,0.0,0.0,1.0,69,70,0
9,0.0,1.0,0.0,75,80,0


In [17]:
y = valuesDF.iloc[:,-1:]
y

Unnamed: 0,play
0,0
1,0
2,1
3,1
4,1
5,0
6,1
7,0
8,1
9,1


In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.33,random_state=31)

#### Scaling Values

In [19]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [20]:
X_train = sc.fit_transform(x_train)
X_test = sc.fit_transform(x_test)

### Training the Model

In [21]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

In [22]:
model_prediction = lr.predict(X_test)

In [24]:
modelDF = pd.DataFrame(data=model_prediction, index=range(5), columns=["Predicted Values"])
y_test.reset_index(inplace=True, drop=True)
y_test.rename(columns={"play":"Real Values"}, inplace=True)
comparisonDF = pd.concat([modelDF,y_test],axis=1)

In [25]:
comparisonDF.drop(columns=["index"], inplace=True)

In [26]:
comparisonDF

Unnamed: 0,Predicted Values,Real Values
0,0.451274,1
1,0.049991,0
2,0.429121,1
3,1.136813,1
4,1.266134,0


### Using P-Values to Improve the Model

#### Backwards Elimination

In [27]:
X = np.append(arr=np.ones((14,1)).astype(int),values=x, axis=1)

In [28]:
import statsmodels.api as sm

In [29]:
X_l = x.iloc[:,[0,1,2,3,4,5]].values
X_l = np.array(X_l, dtype=float)
model = sm.OLS(y,X_l).fit()
model.summary()

0,1,2,3
Dep. Variable:,play,R-squared:,0.483
Model:,OLS,Adj. R-squared:,0.16
Method:,Least Squares,F-statistic:,1.493
Date:,"Sat, 14 Dec 2024",Prob (F-statistic):,0.292
Time:,14:57:11,Log-Likelihood:,-4.9501
No. Observations:,14,AIC:,21.9
Df Residuals:,8,BIC:,25.73
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,3.4007,1.859,1.829,0.105,-0.887,7.688
x2,2.9474,1.754,1.680,0.131,-1.098,6.993
x3,2.8194,1.878,1.501,0.172,-1.512,7.151
x4,-0.0143,0.025,-0.569,0.585,-0.072,0.044
x5,-0.0142,0.014,-1.031,0.333,-0.046,0.018
x6,-0.4108,0.268,-1.533,0.164,-1.029,0.207

0,1,2,3
Omnibus:,0.118,Durbin-Watson:,1.317
Prob(Omnibus):,0.943,Jarque-Bera (JB):,0.34
Skew:,0.035,Prob(JB):,0.844
Kurtosis:,2.24,Cond. No.,2860.0


In [30]:
X_l = x.iloc[:,[0,1,2,4,5]].values
X_l = np.array(X_l, dtype=float)
model = sm.OLS(y,X_l).fit()
model.summary()

0,1,2,3
Dep. Variable:,play,R-squared:,0.462
Model:,OLS,Adj. R-squared:,0.223
Method:,Least Squares,F-statistic:,1.931
Date:,"Sat, 14 Dec 2024",Prob (F-statistic):,0.189
Time:,14:57:12,Log-Likelihood:,-5.228
No. Observations:,14,AIC:,20.46
Df Residuals:,9,BIC:,23.65
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,2.5317,1.021,2.480,0.035,0.222,4.841
x2,2.1712,1.062,2.045,0.071,-0.231,4.573
x3,1.9472,1.045,1.863,0.095,-0.417,4.311
x4,-0.0171,0.012,-1.399,0.195,-0.045,0.011
x5,-0.3586,0.242,-1.481,0.173,-0.906,0.189

0,1,2,3
Omnibus:,0.557,Durbin-Watson:,1.461
Prob(Omnibus):,0.757,Jarque-Bera (JB):,0.554
Skew:,-0.005,Prob(JB):,0.758
Kurtosis:,2.025,Cond. No.,1250.0
