# Machine Learning 2

https://github.com/MichalKorzycki/WarsztatPythonDataScience

Plik: `MachineLearning2.ipynb`




#### - Łańcuchy przetwarzania - Pipelines w sklearn
#### - Charakterystyka danych tekstowych
#### - Przestrzeń do rozwoju
#### - Trenowanie, walidacja i testowanie modeli
- Proces budowy modelu
- Walidacja krzyżowa
---




https://www.gumtree.pl/a-mieszkania-i-domy-sprzedam-i-kupie/praga-polnoc/mieszkanie-inwestycyjne-4+pok-przy-metrze-wilenska-targowa-70/1007172232370910500042709

---

In [None]:
import pandas as pd
from numpy import log2

# Read the semicolon-separated adverts file and derive the regression target:
# price ('Cena') divided by floor area ('Wielkość (m2)').
data = pd.read_csv('adverts_29_04.csv', sep=';')
data['cena_za_metr'] = data['Cena'].div(data['Wielkość (m2)'])

# Keep only the rows for which the target could be computed.
data = data.dropna(subset=['cena_za_metr'])

# 'Cena' and 'Data dodania' are removed from the feature table; the listed
# categorical columns are one-hot encoded.
df = data.drop(columns=['Cena', 'Data dodania'])
dum_df = pd.get_dummies(
    df,
    columns=[
        'Lokalizacja',
        'Na sprzedaż przez',
        'Rodzaj nieruchomości',
        'Liczba pokoi',
        'Liczba łazienek',
        'Parking',
    ],
)
dum_df

In [None]:
from sklearn.linear_model import LinearRegression

# Target: price per square metre. Features: every other column except the
# free-text description 'opis'.
y = dum_df['cena_za_metr']
X = dum_df.drop(columns=['cena_za_metr', 'opis'])

# Fit ordinary least squares and report the score on the same data it was
# trained on.
reg = LinearRegression()
reg.fit(X, y)
reg.score(X, y)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Two-step pipeline: standardise the features, then fit a linear regression.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('linear', LinearRegression()),
])

In [None]:

# Same target/feature split as for the baseline model.
y = dum_df['cena_za_metr']
X = dum_df.drop(columns=['cena_za_metr', 'opis'])

# Pipeline.fit returns the pipeline itself, so `reg` refers to the fitted
# pipeline; the score is computed on the training data.
reg = pipeline.fit(X, y)
reg.score(X, y)

---

## Co z opisem ?

In [None]:
import gzip
import sys
import re

# Build the lookup used by lematize(): every field after the first on a line
# is mapped to that line's first field. All fields are stripped and
# lower-cased. (Presumably the first field is the base form and the rest are
# its inflected forms - confirm against the odm.txt.gz format.)
dictionary = {}

# The context manager closes the gzip handle even if parsing fails.
with gzip.open('odm.txt.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        forms = [form.strip().lower() for form in line.strip().split(',')]
        base = forms[0]
        for w in forms[1:]:
            dictionary[w] = base

def lematize(w):
    """Return the dictionary entry for ``w``, or ``w`` itself if unknown.

    The word is first brought to Unicode NFC form, so that a letter written
    as "base letter + combining mark" becomes the single precomposed
    character. This replaces the earlier hand-written ``str.replace`` calls,
    which handled only four letters, and covers every such letter uniformly.
    (Assumes the keys of ``dictionary`` use precomposed characters - confirm
    against the dictionary file.)

    Parameters:
        w: a single token (``str``); callers in this notebook lower-case it
           before calling.

    Returns:
        ``dictionary[w]`` for the normalised token when present, otherwise
        the normalised token.
    """
    # Local import keeps the cell self-contained.
    import unicodedata

    w = unicodedata.normalize('NFC', w)
    return dictionary.get(w, w)

# Take the first description by position. The frame was filtered with
# dropna(), so a row labelled 0 is not guaranteed to exist and the
# label-based lookup ['opis'][0] could raise KeyError.
opis1 = dum_df['opis'].iloc[0]

In [None]:
import re

splitter = re.compile(r'[^ąąćęńłóóśśżżź\w]+')
isnumber = re.compile(r'[0-9]')


def preprocessing(opis):
    """Convert one description into a list of lemmatised tokens.

    The value is passed through ``str``, split with the module-level
    ``splitter`` pattern and lower-cased; tokens in which ``isnumber`` finds
    a match are dropped, and the remaining ones are mapped through
    ``lematize``.

    Parameters:
        opis: the description; any value accepted by ``str``.

    Returns:
        list of processed tokens, in their original order.
    """
    lowered = (token.lower() for token in splitter.split(str(opis)))
    return [
        lematize(token)
        for token in lowered
        if isnumber.search(token) is None
    ]

In [None]:
# Tokenise the raw (case-preserving) descriptions. The column is addressed by
# name rather than by position inside the row, and the value goes through
# str() - as preprocessing() does - so a missing description cannot crash
# the regex split.
raw_corpus = [list(splitter.split(str(opis))) for opis in dum_df['opis']]
n = len(raw_corpus)

# Flatten with the last document first. This is the order the previous
# front-insertion produced, without re-copying the whole list per document.
all_words = []
for t in reversed(raw_corpus):
    all_words.extend(t)

print(f'Słów: {len(all_words)} z {n} dokumentów')

# Per lower-cased word, count two kinds of occurrences:
#   'lower' - the token is entirely lower-case or entirely upper-case,
#   'upper' - anything else (mixed case, e.g. a capitalised word).
words = {}
for w in all_words:
    key = w.lower()
    rec = words.setdefault(key, {'upper': 0, 'lower': 0})
    if w == key or w == w.upper():
        rec['lower'] += 1
    else:
        rec['upper'] += 1

print(len(words))

# Words seen in mixed case at least 8 times as often as otherwise
# (presumably proper names) are collected as stop-word candidates.
raw_stop_words = [
    word for word, rec in words.items()
    if rec['upper'] >= rec['lower'] * 8
]
print(len(raw_stop_words))
print(raw_stop_words[:100])



In [None]:
import re

splitter = re.compile(r'[^ąąćęńłóóśśżżź\w]+')
isnumber = re.compile(r'[0-9]')

set_raw_stop_words = set(raw_stop_words)

def preprocessing(opis):
    """Convert one description into a list of filtered, lemmatised tokens.

    The value is passed through ``str`` and split with ``splitter``. Each
    token is lower-cased and then discarded if it has two characters or
    fewer, if ``isnumber`` finds a match in it, or if it is in
    ``set_raw_stop_words``. Surviving tokens are mapped through ``lematize``
    and kept only when the result is longer than two characters.

    Parameters:
        opis: the description; any value accepted by ``str``.

    Returns:
        list of processed tokens, in their original order.
    """
    result = []
    for token in splitter.split(str(opis)):
        token = token.lower()
        if len(token) <= 2:
            continue
        if isnumber.search(token) is not None:
            continue
        if token in set_raw_stop_words:
            continue
        lemma = lematize(token)
        if len(lemma) > 2:
            result.append(lemma)
    return result

In [None]:
opis1

In [None]:
print(preprocessing(opis1))

In [None]:
# One token list per advert. The description is read from the 'opis' column
# by name; the earlier positional lookup inside each row depended on column
# order and on integer keys being treated as positions.
corpus = [preprocessing(opis) for opis in dum_df['opis']]

print(f"Mamy {len(corpus)} tekstów")

# Flatten the per-document token lists, keeping document order.
all_words = [word for t in corpus for word in t]

print(f"Mamy {len(all_words)} wyrazów")
all_words[:15]

In [None]:
# Frequency table: word -> number of occurrences in all_words.
counter = {}

for w in all_words:
    if w in counter:
        counter[w] += 1
    else:
        counter[w] = 1

print(f"Mamy {len(counter.keys())} RÓŻNYCH wyrazów")

# (word, count) pairs in first-occurrence order.
counted_words = list(counter.items())
counted_words[:4]

In [None]:
from operator import itemgetter

# Order the (word, count) pairs by count, most frequent first; the sort is
# stable, so ties keep their previous relative order.
counted_words = sorted(counted_words, key=itemgetter(1), reverse=True)
counted_words[:10]

In [None]:
# Just the counts, in the same order as counted_words.
counts = [cnt for _, cnt in counted_words]

In [None]:
len(counts)

In [None]:
sum(counts)

In [None]:
sum(counts[:140])

In [None]:
# Single-column frame holding the first 140 entries of `counts`; it is the
# data source for the scatter plot drawn below.
count_df = pd.DataFrame(counts[:140])
count_df

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import matplotlib.dates as mdates

# Select the style before creating the figure: style settings are read when
# artists are created, so a figure created first would not pick up the dark
# background while the axes added afterwards would.
plt.style.use("dark_background")
plt.figure(figsize=(24,12))

# Scatter of the word counts held in count_df.
chart = sns.scatterplot(
                     color='purple',
                     data=count_df
                    )

## Metryka TF-IDF
ile razy występuje wyraz *i* w tekście *j*
$${n}_{ij}$$ 
 ### Term Frequency (TF)
 
 $${tf}_{ij} = \frac{{n}_{ij}}{\sum_{k}{{n}_{kj}}}$$
 
 W tekście *j* sprawdzamy ile proporcjonalnie do całości występuje wyraz *i*
### Inverse Document Frequency (IDF)

 $$idf_i = \log \frac{|D|}{|\{ d \in D : n_{id} > 0 \}|}$$
 
 licznik - liczba dokumentów
 
 mianownik - liczba dokumentów, w których wystąpił wyraz *i*-ty

## Pipeline dla tekstu

In [None]:
# Overwrite every description with its preprocessed tokens joined by single
# spaces, so the column holds plain strings again.
dum_df["opis"] = dum_df["opis"].map(lambda text: ' '.join(preprocessing(text)))

In [None]:
dum_df

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns one entry of its input, by key.

    In this notebook it is used as the first pipeline step to pull the
    'opis' column out of the feature frame.

    Parameters:
        key: the key (column name) looked up in ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Learn nothing; return the transformer unchanged."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        selected = data_dict[self.key]
        return selected


# Text-only model: select the 'opis' column, vectorise it with TF-IDF,
# reduce to 120 components with truncated SVD and fit a linear regression.
# random_state pins the SVD's randomised solver so that repeated runs of the
# cell give the same score.
pipeline = Pipeline([
                ('selector', ItemSelector(key='opis')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=120, random_state=0)),
                ('linear', LinearRegression())
            ])

In [None]:
# Here 'opis' stays in X: the pipeline's selector step picks it out itself.
y = dum_df['cena_za_metr']
X = dum_df.drop(columns=['cena_za_metr'])

# Pipeline.fit returns the pipeline itself.
reg = pipeline.fit(X, y)

# Score on the training data.
reg.score(X, y)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

class ItemSelector(BaseEstimator, TransformerMixin):
    """Pick a single entry (here: one DataFrame column) out of the input.

    Parameters:
        key: the key (column name) looked up in ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """No fitting is required; the instance is returned as is."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        picked = data_dict[self.key]
        return picked

                              
class ItemUnSelector(BaseEstimator, TransformerMixin):
    """Counterpart of ItemSelector: return the input without one column.

    Parameters:
        key: name of the column removed in ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """No fitting is required; the instance is returned as is."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict`` with the column ``self.key`` dropped."""
        remaining = data_dict.drop(columns=[self.key])
        return remaining


# Combined model: tabular features and text features are produced by two
# parallel branches, concatenated by FeatureUnion and fed to one regression.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # Tabular branch: every column except the free-text 'opis',
            # passed through unchanged.
            ('table', Pipeline([
                ('selector', ItemUnSelector(key='opis')),
            ])),

            # Text branch: TF-IDF of the 'opis' column reduced to 120
            # components with truncated SVD. random_state pins the
            # randomised solver so reruns give the same result.
            ('description', Pipeline([
                ('selector', ItemSelector(key='opis')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=120, random_state=0)),
            ]))
        ],

        # Multiplicative weight applied to each branch's output.
        transformer_weights={
            'table': 1.0,
            'description': 1.0,
        },
    )),

    # Linear regression on the combined features.
    ('linear', LinearRegression())
])

In [None]:
# 'opis' stays in X because both pipeline branches select their own columns.
y = dum_df['cena_za_metr']
X = dum_df.drop(columns=['cena_za_metr'])

# Pipeline.fit returns the pipeline itself.
reg = pipeline.fit(X, y)

# Score on the training data.
reg.score(X, y)

---
## Przestrzeń do rozwoju
- Feature engineering

In [None]:
import pandas as pd
from numpy import log2
from sklearn.linear_model import LinearRegression

# Reload the adverts from scratch and rebuild the baseline: target is price
# ('Cena') divided by floor area ('Wielkość (m2)').
data = pd.read_csv('adverts_29_04.csv', sep=';')
data['cena_za_metr'] = data['Cena'].div(data['Wielkość (m2)'])
data = data.dropna(subset=['cena_za_metr'])
df = data.drop(columns=['Cena', 'Data dodania'])
dum_df = pd.get_dummies(
    df,
    columns=[
        'Lokalizacja',
        'Na sprzedaż przez',
        'Rodzaj nieruchomości',
        'Liczba pokoi',
        'Liczba łazienek',
        'Parking',
    ],
)


y = dum_df['cena_za_metr']
X = dum_df.drop(columns=['cena_za_metr', 'opis'])

# Reference score (on the training data) before any feature engineering.
reg = LinearRegression()
reg.fit(X, y)
reg.score(X, y)

In [None]:
import pandas as pd
from numpy import log2
from sklearn.linear_model import LinearRegression

# Same data preparation as the baseline, plus one engineered feature:
# the base-2 logarithm of the floor area, stored in the 'log' column.
data = pd.read_csv('adverts_29_04.csv', sep=';')
data['cena_za_metr'] = data['Cena'].div(data['Wielkość (m2)'])
data["log"] = log2(data['Wielkość (m2)'])
data = data.dropna(subset=['cena_za_metr'])
df = data.drop(columns=['Cena', 'Data dodania'])
dum_df = pd.get_dummies(
    df,
    columns=[
        'Lokalizacja',
        'Na sprzedaż przez',
        'Rodzaj nieruchomości',
        'Liczba pokoi',
        'Liczba łazienek',
        'Parking',
    ],
)


y = dum_df['cena_za_metr']
X = dum_df.drop(columns=['cena_za_metr', 'opis'])

# Score (on the training data) with the extra 'log' feature included.
reg = LinearRegression()
reg.fit(X, y)
reg.score(X, y)



- Optymalizacja hiperparametrów

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer with no learned state that extracts ``data[key]``.

    Parameters:
        key: the key (column name) looked up in ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to fit; return ``self`` so the step can be chained."""
        return self

    def transform(self, data_dict):
        """Look up ``self.key`` in ``data_dict`` and return the result."""
        column = data_dict[self.key]
        return column


# Text-only model with 120 SVD components. random_state pins the randomised
# SVD solver, so a difference from a run with another n_components reflects
# the hyper-parameter rather than run-to-run noise.
pipeline = Pipeline([
                ('selector', ItemSelector(key='opis')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=120, random_state=0)),
                ('linear', LinearRegression())
            ])

# 'opis' stays in X; the selector step extracts it.
y = dum_df['cena_za_metr']
X = dum_df.drop(['cena_za_metr'], axis=1)

reg = pipeline.fit(X, y)

# Score on the training data.
reg.score(X,y)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

class ItemSelector(BaseEstimator, TransformerMixin):
    """Select one entry of the input by key; nothing is learned in ``fit``.

    Parameters:
        key: the key (column name) looked up in ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Return the instance unchanged."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        chosen = data_dict[self.key]
        return chosen


# Text-only model with 250 SVD components. random_state pins the randomised
# SVD solver, so a difference from a run with another n_components reflects
# the hyper-parameter rather than run-to-run noise.
pipeline = Pipeline([
                ('selector', ItemSelector(key='opis')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=250, random_state=0)),
                ('linear', LinearRegression())
            ])

# 'opis' stays in X; the selector step extracts it.
y = dum_df['cena_za_metr']
X = dum_df.drop(['cena_za_metr'], axis=1)

reg = pipeline.fit(X, y)

# Score on the training data.
reg.score(X,y)

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Variant of the tabular model that uses Normalizer as the preprocessing
# step in front of the linear regression.
pipeline = Pipeline(steps=[
    ('scaler', Normalizer()),
    ('linear', LinearRegression()),
])

y = dum_df['cena_za_metr']
X = dum_df.drop(columns=['cena_za_metr', 'opis'])

# Pipeline.fit returns the pipeline itself; the score is on the training data.
reg = pipeline.fit(X, y)
reg.score(X, y)

... ale to na kolejnym spotkaniu

---
## Trenowanie, testowanie, walidacja

- Błąd, który popełniamy

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Select the style before creating the figure: style settings are read when
# artists are created, so a figure created first would not pick up the dark
# background.
plt.style.use("dark_background")
plt.figure(figsize=(10,6))

# Two curves on a dense grid of 100 points: a quadratic and a straight line.
x = np.linspace(-2, 2, 100)
plt.plot(x, x+0.6*x*x)
plt.plot(x, 1.3*x)

# Ten sample points drawn on top; the trailing semicolon suppresses the
# notebook's textual output of the returned object.
x = np.linspace(-2, 2, 10)
plt.scatter(x, x+0.5*np.abs(x));

* Oddzielmy trenowanie od walidacji

In [None]:
# train_test_split is not imported by any earlier cell, so bring it in here.
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [None]:
X_train.shape, y_train.shape

In [None]:
X_test.shape, y_test.shape

In [None]:
# Fit on the training part only and score on the held-out part.
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
# Same experiment with only 1% of the rows held out (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.01, random_state=0
)
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
# LinearRegression no longer accepts `normalize=` (removed in scikit-learn
# 1.2); the supported replacement is an explicit scaling step in a pipeline.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Unseeded split: every run of this cell draws a different 10% hold-out set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# with_mean=False is the setting the scikit-learn deprecation notice gave for
# replacing normalize=True.
reg = make_pipeline(StandardScaler(with_mean=False), LinearRegression()).fit(X_train, y_train)
reg.score(X_test,y_test)

---
## Trenowanie, walidacja i testowanie modeli

### Proces budowy modelu

1. Dzielimy dane na zbiór _**trenujący**_ (np. 75%), zbiór  _**walidacyjny**_  (np. 15%), zbiór _**testowy**_ (np. 10%)
2. Trenujemy różne modele na zbiorze _**trenującym**_
3. Oceniamy modele na zbiorze _**walidacyjnym**_
4. Wybieramy najlepszy
5. Skuteczność podajemy na zbiorze  _**testowym**_

---
### Walidacja krzyżowa



![Walidacja krzyżowa](xvi.png)

https://scikit-learn.org/stable/modules/cross_validation.html

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
import pandas as pd
from sklearn.linear_model import LinearRegression


y = dum_df['cena_za_metr']
X = dum_df.drop(['cena_za_metr', 'opis'], axis=1)

# Keep 10% of the rows aside as a test set; cross-validation runs only on
# the remaining training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

# 5-fold cross-validation. With no `scoring` argument the estimator's own
# score method is used, which for LinearRegression is R^2 - not accuracy.
scores = cross_val_score(LinearRegression(), X_train, y_train, cv=5)
print(list(scores))
print()
# Mean score with a band of two standard deviations across the folds.
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))