# Warsztaty Python w Data Science

---

## Machine Learning - część 2 z 5. Walidacja krzyżowa. Regresja  

- ### Balansowanie próby
- ### Walidacja Krzyżowa
- ### Inżynieria wymiarów
- ### Regresja
---


## Proces nauczania w Machine Learning

1. Przygotowanie danych
2. Podział danych
3. Budowanie modelu
4. Test dokładności

https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [None]:
import pandas as pd
# Read data/spam.csv into `df`; the file is decoded as ISO-8859-1 instead of
# pandas' default UTF-8.
df = pd.read_csv('data/spam.csv', encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Give the two used columns descriptive names and discard the three
# 'Unnamed: N' columns; the cleaned frame replaces `df`.
df = df.rename(columns={'v1': 'class_label', 'v2': 'message'}).drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
)
df

In [None]:
# Number of rows per distinct class_label value.
df['class_label'].value_counts()

In [None]:
import numpy as np
import matplotlib.pyplot as ab

ab.style.use("dark_background")

# Bar captions and their hard-coded frequencies.
# NOTE(review): the counts are presumably copied from the value_counts()
# output of the dataset - confirm they still match the loaded file.
labels, counts = ['ham', 'spam'], [4825, 747]

# One integer x-position per text label (0, 1, ...), used to place the bars.
ypos = np.arange(len(labels))
ypos

In [None]:
# Draw the class-frequency bar chart on the current axes, using the Axes
# methods that the pyplot helpers delegate to.
ax = ab.gca()
ax.set_xticks(ypos)
ax.set_xticklabels(labels)
ax.set_xlabel("class label")
ax.set_ylabel("Frequency")
ax.set_title("# of spam and ham in dataset")
ax.bar(ypos, counts);

In [None]:
# Encode the target as integers: 1 for 'spam', 0 for every other value.
df['class_label'] = df['class_label'].eq('spam').astype('int64')

In [None]:
# Preview the frame after class_label was replaced by 0/1 codes.
df.head()

In [None]:
# Separate the rows by encoded label: 0 and 1 respectively.
df_class_0 = df.loc[df['class_label'].eq(0)]
df_class_1 = df.loc[df['class_label'].eq(1)]

In [None]:
# (rows, columns) of the label-0 subset.
df_class_0.shape

In [None]:
# (rows, columns) of the label-1 subset.
df_class_1.shape

## Robimy OVERSAMPLING (dokładamy do mniejszej klasy powielone wartości)
#### moglibyśmy zrobić UNDERSAMPLING (usuwamy z większej klasy)
#### albo w ogóle dołożyć SYNTETYCZNE dane ("sztuczne")

In [None]:
# Random over-sampling: draw label-1 rows with replacement until there are as
# many of them as label-0 rows, then stack both groups into one frame.
# A fixed random_state makes the resampled frame reproducible, in line with
# the random_state=0 used by the train/test splits in this notebook.
df_class_1_over = df_class_1.sample(len(df_class_0), replace=True, random_state=0)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
class_counts = df_test_over.class_label.value_counts()
print(class_counts)

# Trailing ';' suppresses the textual repr of the returned Axes in the notebook.
class_counts.plot(kind='bar', title="# of spam and ham in dataset");

---
## Klasyfikacja niezbalansowanych zbiorów

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the original (unbalanced) messages for testing; the fixed
# seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['message'],
    df['class_label'],
    test_size=0.3,
    random_state=0,
)
print(f'rows in test set: {x_test.shape}')
print(f'rows in train set: {x_train.shape}')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Training messages as a plain list of strings.
data = x_train.tolist()

# The corpus is handed to fit_transform(); the `input` parameter only selects
# how documents are read ('content', 'filename' or 'file'), so it is left at
# its default instead of being given the corpus (newer scikit-learn rejects
# any other value).
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
# Learn vocabulary and IDF weights on the training messages only, then reuse
# them to encode the test messages.
features_train_transformed = vectorizer.fit_transform(data)
features_test_transformed = vectorizer.transform(x_test)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense view of the training matrix, one column per vocabulary term.
df_vectorized = pd.DataFrame(features_train_transformed.toarray(), columns=feature_names)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
classifier = MultinomialNB()
classifier.fit(features_train_transformed, y_train)

In [None]:
# Score of the fitted classifier on the held-out features, shown as a percentage.
print(f"classifier accuracy {classifier.score(features_test_transformed, y_test) * 100:.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)

# Predictions for the held-out features next to the true labels.
labels = classifier.predict(features_test_transformed)
predicted = labels
actual = y_test.tolist()

# Confusion matrix is kept in `results` for the heatmap cell that follows.
results = confusion_matrix(actual, predicted)
print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_score(actual, predicted))
print('Report : ')
print(classification_report(actual, predicted))

score_2 = f1_score(actual, predicted, average='binary')
print(f'F-Measure: {score_2:.3f}')

In [None]:
import seaborn as sns

# Captions in the row-major order of the flattened 2x2 confusion matrix.
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
# Absolute count and share of all samples for each matrix cell.
group_counts = [f"{value:0.0f}" for value in results.flatten()]
group_percentages = [f"{value:.2%}" for value in results.flatten() / results.sum()]
# Three-line annotation per cell: caption, count, percentage.
labels = ["\n".join(parts) for parts in zip(group_names, group_counts, group_percentages)]
labels = np.array(labels).reshape(2, 2)
# fmt='' because the annotations are already formatted strings.
sns.heatmap(results, annot=labels, fmt='', cmap='Reds');

---
## Klasyfikacja zbalansowanych zbiorów

In [None]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL messages first and balance only the training part.
# Splitting a frame that was already over-sampled puts copies of the same
# spam message in both train and test, which inflates every test metric.
train_part, test_part = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
    stratify=df['class_label'],  # keep the class ratio equal in both parts
)

# Random over-sampling of label 1 inside the training part only.
train_class_0 = train_part[train_part['class_label'] == 0]
train_class_1 = train_part[train_part['class_label'] == 1]
train_class_1_over = train_class_1.sample(len(train_class_0), replace=True, random_state=0)
train_balanced = pd.concat([train_class_0, train_class_1_over], axis=0)

x_train, y_train = train_balanced['message'], train_balanced['class_label']
# The test set keeps the original class proportions and holds unseen rows only.
x_test, y_test = test_part['message'], test_part['class_label']

print('rows in test set: ' + str(x_test.shape))
print('rows in train set: ' + str(x_train.shape))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Training messages as a plain list of strings.
data = x_train.tolist()

# `input` only accepts 'content', 'filename' or 'file'; the corpus itself is
# passed to fit_transform(), so the parameter stays at its default.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
# Fit on the training messages only; the test messages are encoded with the
# vocabulary and IDF weights learned there.
features_train_transformed = vectorizer.fit_transform(data)
features_test_transformed = vectorizer.transform(x_test)

# get_feature_names() no longer exists in scikit-learn >= 1.2; use the
# replacement when present and the old name otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense view of the training matrix, one column per vocabulary term.
df_vectorized = pd.DataFrame(features_train_transformed.toarray(), columns=feature_names)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a fresh multinomial Naive Bayes classifier on the current TF-IDF
# training features (this replaces the previously fitted `classifier`).
classifier = MultinomialNB()
classifier.fit(features_train_transformed, y_train)

In [None]:
# Held-out score of the current classifier, formatted as a percentage.
print(f"classifier accuracy {classifier.score(features_test_transformed, y_test) * 100:.2f}%")

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, f1_score

# Model output for the held-out features and the matching ground truth.
labels = classifier.predict(features_test_transformed)
actual, predicted = y_test.tolist(), labels

# `results` is read again by the heatmap cell below.
results = confusion_matrix(actual, predicted)
print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_score(actual, predicted))
print('Report : ')
print(classification_report(actual, predicted))

score_2 = f1_score(actual, predicted, average='binary')
print(f'F-Measure: {score_2:.3f}')

In [None]:
import seaborn as sns

# Cell captions, ordered like the flattened 2x2 confusion matrix.
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
# Per-cell count and fraction of all evaluated samples.
group_counts = [f"{value:0.0f}" for value in results.flatten()]
group_percentages = [f"{value:.2%}" for value in results.flatten() / results.sum()]
# Combine caption, count and percentage into one multi-line annotation.
labels = ["\n".join(parts) for parts in zip(group_names, group_counts, group_percentages)]
labels = np.array(labels).reshape(2, 2)
# Annotations are ready-made strings, hence the empty format spec.
sns.heatmap(results, annot=labels, fmt='', cmap='Reds');

---
## Walidacja krzyżowa

![Walidacja krzyżowa](img/xvi.png)

https://scikit-learn.org/stable/modules/cross_validation.html

In [None]:
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Keep 10% of the over-sampled frame aside; the rest is used for
# cross-validation.
# NOTE(review): df_test_over holds spam rows drawn with replacement, so copies
# of one message can sit in different folds and in the hold-out part. The
# scores are therefore optimistic - consider over-sampling inside each
# training fold instead.
x_train, x_test, y_train, y_test = train_test_split(df_test_over['message'], df_test_over['class_label'], test_size = 0.1, random_state = 0)
print('rows in test set: ' + str(x_test.shape))
print('rows in train set: ' + str(x_train.shape))
from sklearn.feature_extraction.text import TfidfVectorizer

# Training messages as a plain list of strings.
data = x_train.tolist()

# `input` only accepts 'content', 'filename' or 'file'; the corpus goes to
# fit_transform(), so the parameter is left at its default.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
features_train_transformed = vectorizer.fit_transform(data)
features_test_transformed = vectorizer.transform(x_test)

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on releases that do not have get_feature_names_out() yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense view of the training matrix, one column per vocabulary term.
df_vectorized = pd.DataFrame(features_train_transformed.toarray(), columns=feature_names)


In [None]:
# 5-fold cross-validation of an unfitted MultinomialNB on the training
# features; one score per fold.
scores = cross_val_score(
    MultinomialNB(),
    features_train_transformed,
    y_train,
    cv=5,
)
print(list(scores))
print()
# Mean fold score with a spread of two standard deviations.
print(f"Accuracy: {scores.mean():0.2f} (+/- {scores.std() * 2:0.2f})")

---
## Dobór estymatorów

![Dobór estymatorów](img/ml_map.png)

https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html


---



In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the semicolon-separated adverts file; note that `data` now refers to
# this frame rather than to the message list used earlier in the notebook.
data = pd.read_csv('data/adverts_29_04.csv', sep=';')
data

---
# Feature Engineering

## The features you use influence more than everything else the result. 
## No algorithm alone, to my knowledge, can supplement the information gain given by correct feature engineering.
## <div style="text-align: right">— Luca Massaron Autor, Kaggle master</div>

---

## Coming up with features is difficult, time-consuming, requires expert knowledge.
## "_*Applied machine learning*_" is basically feature engineering.
## <div style="text-align: right">— Andrew Ng</div>

---

In [None]:
# Derived target: price divided by area.
data['cena_za_metr'] = data['Cena'].div(data['Wielkość (m2)'])
# Keep only the rows where the derived value could be computed.
data = data.dropna(subset=['cena_za_metr'])
# Frame without the raw price and the date-added column.
df = data.drop(columns=['Cena', 'Data dodania'])
df

### Zmienne kategoryczne na indeksy

In [None]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

# Work on a copy: a plain `label_encoded = df` only creates a second name for
# the same frame, so the new column would silently appear in `df` as well.
label_encoded = df.copy()

# Replace each distinct 'Lokalizacja' value with an integer code.
label_encoded['Lokalizacja_Cat'] = labelencoder.fit_transform(label_encoded['Lokalizacja'])
label_encoded

## __*One-hot encoding*__ zmiennych kategorycznych

In [None]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')

# One indicator column per distinct 'Lokalizacja_Cat' code.
# The frame must reuse label_encoded's index: rows were removed by dropna()
# earlier, so that index is not guaranteed to be 0..n-1, and join() aligns on
# index labels. A default RangeIndex would attach indicators to the wrong rows
# and leave NaN in others.
enc_df = pd.DataFrame(
    enc.fit_transform(label_encoded[['Lokalizacja_Cat']]).toarray(),
    index=label_encoded.index,
)

one_hot_data = label_encoded.join(enc_df)
one_hot_data

In [None]:
# Same one-hot expansion done directly by pandas: 'Lokalizacja' is replaced by
# one indicator column per distinct value.
dum_df = pd.get_dummies(df, columns=['Lokalizacja'])
dum_df

In [None]:
import pandas as pd
from numpy import log2

# Re-read the adverts file and rebuild the derived columns from scratch.
data = pd.read_csv('data/adverts_29_04.csv', sep=';')
data['cena_za_metr'] = data['Cena'] / data['Wielkość (m2)']
# log2 is a NumPy ufunc, so it is applied to the whole column in one call
# instead of once per element through .apply(lambda ...).
data["log"] = log2(data['Wielkość (m2)'])
data["clog"] = log2(data['Cena'])
# Keep only the rows where the price per square metre could be computed.
data = data.dropna(subset=['cena_za_metr'])
df = data.drop(['Cena', 'Data dodania'], axis=1)
df

In [None]:
# One-hot encode the listed categorical columns. This starts from `data`
# (not `df`), so 'Cena' and 'Data dodania' are still present in the result.
dum_df = pd.get_dummies(data, columns=['Lokalizacja', 'Na sprzedaż przez', 'Rodzaj nieruchomości', 'Liczba pokoi', 'Liczba łazienek', 'Parking'])
dum_df

In [None]:
# Column labels after the one-hot expansion.
dum_df.columns

In [None]:
# Correlation of every numeric / indicator column with the price per square
# metre. Non-numeric columns are filtered out explicitly because
# DataFrame.corr() raises on them in pandas >= 2.0 (older releases dropped
# them silently). NOTE(review): 'opis' and 'Data dodania' are presumably such
# text columns - confirm against the file.
dum_df.select_dtypes(include=['number', 'bool']).corr()['cena_za_metr']

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

plt.figure(figsize=(20,15))
plt.style.use("dark_background")

# Restrict to numeric / indicator columns first: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, while older releases ignored them.
corr_matrix = dum_df.select_dtypes(include=['number', 'bool']).corr()

# Fixed colour range [-1, 1] so that zero correlation sits at the centre of
# the diverging palette.
sns.heatmap(corr_matrix, cmap="seismic", annot=True, vmin=-1, vmax=1);

---
# Regresja

In [None]:
from sklearn.linear_model import LinearRegression

# Target: price per square metre.
y = dum_df['cena_za_metr']
# Features: everything except the text/date columns, the target itself and
# anything derived from the price. 'clog' is log2('Cena'), and the target is
# 'Cena' divided by the area, so leaving 'clog' in (next to 'log', the log2 of
# the area) hands the model the answer: log2(target) == clog - log.
X = dum_df.drop(['opis', 'Data dodania', 'Cena', 'clog', 'cena_za_metr'], axis=1)

reg = LinearRegression().fit(X, y)

In [None]:
# Score of the fitted regression on the same X, y it was trained on
# (for LinearRegression this is the coefficient of determination R^2).
reg.score(X, y)

$
R^2
$

https://en.wikipedia.org/wiki/Coefficient_of_determination

Współczynnik determinacji - jaka część wariancji zmiennej objaśnianej jest wyjaśniana przez zmienne objaśniające

- 1.0 - Idealne dopasowanie
- 0.0 - Funkcja stała (model przewidujący zawsze średnią)
- ... ale może być i ujemna

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer

# Keep 10% of the rows aside; cross-validation runs on the remaining 90%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)

# Without `scoring`, cross_val_score falls back to the estimator's own
# score(), which is R^2 for LinearRegression - not the mean squared error the
# printed label announces. Score each fold with MSE explicitly.
mse_scorer = make_scorer(mean_squared_error)
scores = cross_val_score(LinearRegression(), X_train, y_train, cv=5, scoring=mse_scorer)
print(list(scores))
print()
print("Mean square error: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))