# Age Prediction mit Regression
---

In [2]:
import pandas as pd
import nltk

import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Data preprocessing

In [3]:
df_train = pd.read_csv("train_reg.csv")
df_vali = pd.read_csv("vali_reg.csv")
df_test = pd.read_csv("test_reg.csv")

df_train.labels  = df_train.labels.apply(str)
df_vali.labels  = df_vali.labels.apply(str)
df_test.labels  = df_test.labels.apply(str)

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/constantin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "html.parser").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
df_train['text'] = df_train['text'].apply(clean_text)
df_vali['text'] = df_vali['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)



In [6]:
x_train = df_train['text']
y_train = df_train['labels']
x_test = df_vali['text']
y_test = df_vali['labels']

## Preprocessing Pipeline

In [7]:
wordvecpipe = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),])

# Models Tests

In [11]:
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [12]:
RANDOM_SEED = 42

## Linear Support Vector Regression

In [56]:
from sklearn.svm import LinearSVR

In [58]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', LinearSVR(random_state=RANDOM_SEED))
])

grid = GridSearchCV(
    estimator=pipe, 
    param_grid={
        'model__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        'model__max_iter': [1000,1500,500],
        'model__tol':[1e-4,1e-5,1e-3]
    }, 
    cv=10,
    n_jobs=4,
    verbose=0, # Für volle Information auf 4 ändern
    scoring='neg_mean_absolute_error',
    refit=True
)
grid.fit(x_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('prevec',
                                        Pipeline(steps=[('vect',
                                                         CountVectorizer()),
                                                        ('tfidf',
                                                         TfidfTransformer())])),
                                       ('model', LinearSVR(random_state=42))]),
             n_jobs=4,
             param_grid={'model__loss': ['epsilon_insensitive',
                                         'squared_epsilon_insensitive'],
                         'model__max_iter': [1000, 1500, 500],
                         'model__tol': [0.0001, 1e-05, 0.001]},
             scoring='neg_mean_absolute_error')

In [59]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best model parameters: {'model__loss': 'squared_epsilon_insensitive', 'model__max_iter': 1000, 'model__tol': 1e-05}
Best score: -9.535863286085455


In [60]:
y_pred = grid.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 8.746632022057026
R2: 0.09086570066623523


## Decision Tree Regressor

In [31]:
from sklearn.tree import DecisionTreeRegressor

In [32]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', DecisionTreeRegressor(random_state=RANDOM_SEED))
])

grid = GridSearchCV(
    estimator=pipe, 
    param_grid={'model__max_depth': [2, 5, 10],
                'model__splitter' : ["best", "random"]
                               },
    cv=10,
    n_jobs=1,
    verbose=0,
    scoring='neg_mean_absolute_error',
    refit=True
)
grid.fit(x_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('prevec',
                                        Pipeline(steps=[('vect',
                                                         CountVectorizer()),
                                                        ('tfidf',
                                                         TfidfTransformer())])),
                                       ('model',
                                        DecisionTreeRegressor(random_state=42))]),
             n_jobs=1,
             param_grid={'model__max_depth': [2, 5, 10],
                         'model__splitter': ['best', 'random']},
             scoring='neg_mean_absolute_error')

In [33]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best model parameters: {'model__max_depth': 10, 'model__splitter': 'best'}
Best score: -10.198425903441077


In [34]:
y_pred = grid.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 9.214151171899976
R2: 0.05079540636522706


## Random Forest Regressor

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', RandomForestRegressor(random_state=RANDOM_SEED))
])

grid = GridSearchCV(
    estimator=pipe, 
    param_grid={
                'model__max_depth':[2,5,10],
                'model__n_estimators':[200,300]
                },
    cv=5,
    n_jobs=2,
    verbose=4,
    scoring='neg_mean_absolute_error',
    refit=True
)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 2/5] END model__max_depth=2, model__n_estimators=200;, score=-12.900 total time=   8.8s
[CV 1/5] END model__max_depth=2, model__n_estimators=200;, score=-7.544 total time=   9.3s
[CV 3/5] END model__max_depth=2, model__n_estimators=200;, score=-10.314 total time=   8.2s
[CV 4/5] END model__max_depth=2, model__n_estimators=200;, score=-9.687 total time=   8.7s
[CV 5/5] END model__max_depth=2, model__n_estimators=200;, score=-14.704 total time=   8.5s
[CV 1/5] END model__max_depth=2, model__n_estimators=300;, score=-7.545 total time=  11.7s
[CV 2/5] END model__max_depth=2, model__n_estimators=300;, score=-12.903 total time=  10.0s
[CV 3/5] END model__max_depth=2, model__n_estimators=300;, score=-10.316 total time=  10.3s
[CV 4/5] END model__max_depth=2, model__n_estimators=300;, score=-9.689 total time=  10.8s
[CV 5/5] END model__max_depth=2, model__n_estimators=300;, score=-14.702 total time=  10.7s
[CV 1/5] END model__max_

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prevec',
                                        Pipeline(steps=[('vect',
                                                         CountVectorizer()),
                                                        ('tfidf',
                                                         TfidfTransformer())])),
                                       ('model',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=2,
             param_grid={'model__max_depth': [2, 5, 10],
                         'model__n_estimators': [200, 300]},
             scoring='neg_mean_absolute_error', verbose=4)

In [13]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best model parameters: {'model__max_depth': 10, 'model__n_estimators': 300}
Best score: -10.83743950589


In [15]:
y_pred = grid.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 9.181644862613082
R2: 0.07840671883118888


## Gradient Boosting Regressor

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

In [19]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', GradientBoostingRegressor(random_state=RANDOM_SEED,loss='absolute_error'))
])

grid = GridSearchCV(
    estimator=pipe, 
    param_grid={'model__n_estimators': [2000, 1500],
                                'model__learning_rate' : [1e-2, 1e-25,1e-15],
                               },
    cv=5,
    n_jobs=2,
    verbose=4,
    scoring='neg_mean_absolute_error',
    refit=True
)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 2/5] END model__learning_rate=0.01, model__n_estimators=2000;, score=-15.374 total time= 2.3min
[CV 1/5] END model__learning_rate=0.01, model__n_estimators=2000;, score=-9.943 total time= 2.5min
[CV 3/5] END model__learning_rate=0.01, model__n_estimators=2000;, score=-10.950 total time= 2.7min
[CV 4/5] END model__learning_rate=0.01, model__n_estimators=2000;, score=-8.007 total time= 2.8min
[CV 1/5] END model__learning_rate=0.01, model__n_estimators=1500;, score=-9.943 total time= 2.0min
[CV 5/5] END model__learning_rate=0.01, model__n_estimators=2000;, score=-11.994 total time= 2.5min
[CV 2/5] END model__learning_rate=0.01, model__n_estimators=1500;, score=-15.374 total time= 1.9min
[CV 3/5] END model__learning_rate=0.01, model__n_estimators=1500;, score=-10.952 total time= 1.9min
[CV 4/5] END model__learning_rate=0.01, model__n_estimators=1500;, score=-8.007 total time= 1.9min
[CV 5/5] END model__learning_rate=0.01, mode

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prevec',
                                        Pipeline(steps=[('vect',
                                                         CountVectorizer()),
                                                        ('tfidf',
                                                         TfidfTransformer())])),
                                       ('model',
                                        GradientBoostingRegressor(loss='absolute_error',
                                                                  random_state=42))]),
             n_jobs=2,
             param_grid={'model__learning_rate': [0.01, 1e-25, 1e-15],
                         'model__n_estimators': [2000, 1500]},
             scoring='neg_mean_absolute_error', verbose=4)

In [20]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best model parameters: {'model__learning_rate': 0.01, 'model__n_estimators': 2000}
Best score: -11.253634301522919


In [21]:
y_pred = grid.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 9.110828517691774
R2: -0.010039765900544007


## XGBRegressor

In [22]:
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [23]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', XGBRegressor())
])

grid = GridSearchCV(
    estimator=pipe, 
    param_grid={'model__n_estimators': [2000, 2500],
                                'model__max_depth':[2,5,10]                          
                               },
    cv=5,
    n_jobs=3,
    verbose=4,
    scoring='neg_mean_absolute_error',
    refit=True
)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


[CV 3/5] END model__max_depth=2, model__n_estimators=2000;, score=-9.652 total time=  57.1s
[CV 1/5] END model__max_depth=2, model__n_estimators=2000;, score=-7.992 total time=  57.3s
[CV 2/5] END model__max_depth=2, model__n_estimators=2000;, score=-11.676 total time=  57.3s
[CV 4/5] END model__max_depth=2, model__n_estimators=2000;, score=-8.795 total time=  56.0s
[CV 5/5] END model__max_depth=2, model__n_estimators=2000;, score=-13.787 total time=  56.1s
[CV 1/5] END model__max_depth=2, model__n_estimators=2500;, score=-8.093 total time= 1.1min
[CV 3/5] END model__max_depth=2, model__n_estimators=2500;, score=-9.683 total time= 1.1min
[CV 2/5] END model__max_depth=2, model__n_estimators=2500;, score=-11.692 total time= 1.1min
[CV 4/5] END model__max_depth=2, model__n_estimators=2500;, score=-8.764 total time= 1.2min
[CV 5/5] END model__max_depth=2, model__n_estimators=2500;, score=-13.723 total time= 1.2min
[CV 1/5] END model__max_depth=5, model__n_estimators=2000;, score=-8.560 tot

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prevec',
                                        Pipeline(steps=[('vect',
                                                         CountVectorizer()),
                                                        ('tfidf',
                                                         TfidfTransformer())])),
                                       ('model',
                                        XGBRegressor(base_score=None,
                                                     booster=None,
                                                     colsample_bylevel=None,
                                                     colsample_bynode=None,
                                                     colsample_bytree=None,
                                                     enable_categorical=False,
                                                     gamma=None, gpu_id=None,
                                                     importance_type=N

In [24]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best model parameters: {'model__max_depth': 2, 'model__n_estimators': 2000}
Best score: -10.380327932987623


In [25]:
y_pred = grid.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 8.785981851108497
R2: 0.12123050741159658


## Stacked Model 1

In [39]:
from mlxtend.regressor import StackingCVRegressor

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [40]:
ridge = Ridge()
lasso = Lasso()
rf = RandomForestRegressor(random_state=RANDOM_SEED, max_depth = 10, n_estimators= 300)

In [41]:
stack = StackingCVRegressor(regressors=(ridge, rf),
                            meta_regressor=lasso)

pipe = Pipeline([('prevec',wordvecpipe),('stackmodel',stack)])

In [42]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('prevec',
                 Pipeline(steps=[('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer())])),
                ('stackmodel',
                 StackingCVRegressor(meta_regressor=Lasso(),
                                     regressors=(Ridge(),
                                                 RandomForestRegressor(max_depth=10,
                                                                       n_estimators=300,
                                                                       random_state=42))))])

In [43]:
y_pred = pipe.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 8.584477018765524
R2: 0.15341670282035436


## Stacked Model 2

In [44]:
from sklearn.svm import LinearSVR

In [45]:
lin = LinearSVR(loss = 'squared_epsilon_insensitive', max_iter= 1000, tol= 1e-05)
xgb = XGBRegressor(max_depth= 2, n_estimators= 2000)
rfr = RandomForestRegressor(max_depth = 10, n_estimators = 300)

In [46]:
stack = StackingCVRegressor(regressors=(lin, xgb),
                            meta_regressor=rfr)

pipe = Pipeline([('prevec',wordvecpipe),('stackmodel',stack)])

In [47]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('prevec',
                 Pipeline(steps=[('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer())])),
                ('stackmodel',
                 StackingCVRegressor(meta_regressor=RandomForestRegressor(max_depth=10,
                                                                          n_estimators=300),
                                     regressors=(LinearSVR(loss='squared_epsilon_insensitive',
                                                           tol=1e-05),
                                                 XGBRegressor(base_score=None,
                                                              booster=None,
                                                              colsample_bylevel=None,
                                                              colsample_byno...
                                                              interaction_constraints=None,
                                                     

In [48]:
y_pred = pipe.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 8.523375070707397
R2: 0.1481616207210874


**Safe Model for Frontend**

In [55]:
from joblib import dump
#dump(pipe, 'reg_model.joblib')

['reg_model.joblib']

**Load Model**

In [13]:
import joblib

In [14]:
model_joblib = joblib.load('reg_model.joblib')

In [15]:
y_pred = model_joblib.predict(x_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

MAE: 8.523375070707397
R2: 0.1481616207210874


In [17]:
x_test[0]

'info found + 100 pages 45 mb pdf files wait untill team leader processed learns html'

In [20]:
model_joblib.predict([x_test[0]])[0]

40.350385318226245