# Age Prediction mit Klassifizierung
---

In [1]:
import pandas as pd
import nltk

import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Data preprocessing

In [2]:
df_train = pd.read_csv("agetrain.csv")
df_vali = pd.read_csv("agevali.csv")
df_test = pd.read_csv("agetest.csv")

df_train.labels  = df_train.labels.apply(str)
df_vali.labels  = df_vali.labels.apply(str)
df_test.labels  = df_test.labels.apply(str)

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/constantin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = BeautifulSoup(text, "html.parser").text # HTML decoding
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text
    
df_train['text'] = df_train['text'].apply(clean_text)
df_vali['text'] = df_vali['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)



In [5]:
x_train = df_train['text']
y_train = df_train['labels']
x_test = df_vali['text']
y_test = df_vali['labels']

## Preprocessing Pipeline

In [6]:
wordvecpipe = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),])

# Models Tests

In [7]:
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [8]:
RANDOM_SEED = 42

### Naives Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB

In [10]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', MultinomialNB())
])

pipe.fit(x_train, y_train)

Pipeline(steps=[('prevec',
                 Pipeline(steps=[('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer())])),
                ('model', MultinomialNB())])

In [11]:
y_pred = pipe.predict(x_test)

print(f'Acc: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred,target_names=y_train.unique()))

Acc: 0.19621913580246914
              precision    recall  f1-score   support

           5       0.30      0.38      0.34      1620
           1       0.21      0.35      0.26      1620
           3       0.13      0.13      0.13      1620
           2       0.16      0.16      0.16      1620
           0       0.15      0.10      0.12      1620
           4       0.17      0.09      0.12      1620
           7       0.24      0.17      0.20      1620
           6       0.16      0.18      0.17      1620

    accuracy                           0.20     12960
   macro avg       0.19      0.20      0.19     12960
weighted avg       0.19      0.20      0.19     12960



### SGDClassifier

In [10]:
from sklearn.linear_model import SGDClassifier

In [29]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', SGDClassifier(random_state=RANDOM_SEED))
])

grid = GridSearchCV(
    estimator=pipe, 
    param_grid={
                'model__loss' : ['modified_huber'], # unnötige gelöscht nach durchführung des gridsearchs mit allen loss
                'model__penalty' : ['elasticnet'], # ebenfall unnötige gelöscht
                'model__l1_ratio':[0.1,0.15]
                },
    cv=5,
    n_jobs=3,
    verbose=4,
    scoring='accuracy',
    refit=True
)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END model__l1_ratio=0.1, model__loss=modified_huber, model__penalty=elasticnet;, score=0.187 total time=   4.3s
[CV 3/5] END model__l1_ratio=0.1, model__loss=modified_huber, model__penalty=elasticnet;, score=0.253 total time=   4.4s
[CV 2/5] END model__l1_ratio=0.1, model__loss=modified_huber, model__penalty=elasticnet;, score=0.195 total time=   4.5s
[CV 4/5] END model__l1_ratio=0.1, model__loss=modified_huber, model__penalty=elasticnet;, score=0.274 total time=   4.7s
[CV 5/5] END model__l1_ratio=0.1, model__loss=modified_huber, model__penalty=elasticnet;, score=0.223 total time=   4.8s
[CV 1/5] END model__l1_ratio=0.15, model__loss=modified_huber, model__penalty=elasticnet;, score=0.185 total time=   4.8s
[CV 2/5] END model__l1_ratio=0.15, model__loss=modified_huber, model__penalty=elasticnet;, score=0.193 total time=   4.0s
[CV 4/5] END model__l1_ratio=0.15, model__loss=modified_huber, model__penalty=elasticnet;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prevec',
                                        Pipeline(steps=[('vect',
                                                         CountVectorizer()),
                                                        ('tfidf',
                                                         TfidfTransformer())])),
                                       ('model',
                                        SGDClassifier(random_state=42))]),
             n_jobs=3,
             param_grid={'model__l1_ratio': [0.1, 0.15],
                         'model__loss': ['modified_huber'],
                         'model__penalty': ['elasticnet']},
             scoring='accuracy', verbose=4)

In [30]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best model parameters: {'model__l1_ratio': 0.1, 'model__loss': 'modified_huber', 'model__penalty': 'elasticnet'}
Best score: 0.22621527777777778


In [31]:
y_pred = grid.predict(x_test)

print(f'Acc: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred,target_names=y_train.unique()))

Acc: 0.1882716049382716
              precision    recall  f1-score   support

           5       0.29      0.38      0.33      1620
           1       0.21      0.23      0.22      1620
           3       0.14      0.14      0.14      1620
           2       0.15      0.14      0.15      1620
           0       0.15      0.17      0.16      1620
           4       0.16      0.12      0.14      1620
           7       0.21      0.20      0.21      1620
           6       0.16      0.12      0.13      1620

    accuracy                           0.19     12960
   macro avg       0.18      0.19      0.18     12960
weighted avg       0.18      0.19      0.18     12960



### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

In [34]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', LogisticRegression(random_state=RANDOM_SEED))
])

grid = GridSearchCV(
    estimator=pipe, 
    param_grid={'model__dual': [True,False],
                'model__penalty' : ['l2', 'l1', 'elasticnet'],
                'model__max_iter' : [100,500,1000]
                },
    cv=5,
    n_jobs=3,
    verbose=4,
    scoring='accuracy',
    refit=True
)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END model__dual=True, model__max_iter=100, model__penalty=l2;, score=nan total time=   0.8s
[CV 3/5] END model__dual=True, model__max_iter=100, model__penalty=l2;, score=nan total time=   0.8s
[CV 2/5] END model__dual=True, model__max_iter=100, model__penalty=l2;, score=nan total time=   0.9s
[CV 4/5] END model__dual=True, model__max_iter=100, model__penalty=l2;, score=nan total time=   0.8s
[CV 5/5] END model__dual=True, model__max_iter=100, model__penalty=l2;, score=nan total time=   0.8s
[CV 1/5] END model__dual=True, model__max_iter=100, model__penalty=l1;, score=nan total time=   0.8s
[CV 2/5] END model__dual=True, model__max_iter=100, model__penalty=l1;, score=nan total time=   0.8s
[CV 4/5] END model__dual=True, model__max_iter=100, model__penalty=l1;, score=nan total time=   0.7s
[CV 3/5] END model__dual=True, model__max_iter=100, model__penalty=l1;, score=nan total time=   0.8s
[CV 5/5] END model__dual=True,

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END model__dual=False, model__max_iter=100, model__penalty=l2;, score=0.190 total time=  16.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END model__dual=False, model__max_iter=100, model__penalty=l2;, score=0.256 total time=  16.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END model__dual=False, model__max_iter=100, model__penalty=l2;, score=0.191 total time=  16.7s
[CV 1/5] END model__dual=False, model__max_iter=100, model__penalty=l1;, score=nan total time=   0.8s
[CV 2/5] END model__dual=False, model__max_iter=100, model__penalty=l1;, score=nan total time=   1.1s
[CV 3/5] END model__dual=False, model__max_iter=100, model__penalty=l1;, score=nan total time=   1.1s
[CV 4/5] END model__dual=False, model__max_iter=100, model__penalty=l1;, score=nan total time=   1.0s
[CV 5/5] END model__dual=False, model__max_iter=100, model__penalty=l1;, score=nan total time=   1.3s
[CV 1/5] END model__dual=False, model__max_iter=100, model__penalty=elasticnet;, score=nan total time=   1.4s
[CV 2/5] END model__dual=False, model__max_iter=100, model__penalty=elasticnet;, score=nan total time=   2.0s
[CV 3/5] END model__dual=False, model__max_iter=100, model__penalty=elasticnet;, score=nan total time=   1.6s
[CV 4/5] END model__dual=False, model__max_iter=100, mod

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END model__dual=False, model__max_iter=100, model__penalty=l2;, score=0.281 total time=  13.5s
[CV 5/5] END model__dual=False, model__max_iter=100, model__penalty=elasticnet;, score=nan total time=   1.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END model__dual=False, model__max_iter=100, model__penalty=l2;, score=0.228 total time=  15.4s
[CV 1/5] END model__dual=False, model__max_iter=500, model__penalty=l2;, score=0.191 total time=  27.7s
[CV 2/5] END model__dual=False, model__max_iter=500, model__penalty=l2;, score=0.190 total time=  31.8s
[CV 3/5] END model__dual=False, model__max_iter=500, model__penalty=l2;, score=0.257 total time=  30.3s
[CV 1/5] END model__dual=False, model__max_iter=500, model__penalty=l1;, score=nan total time=   0.9s
[CV 2/5] END model__dual=False, model__max_iter=500, model__penalty=l1;, score=nan total time=   1.0s
[CV 3/5] END model__dual=False, model__max_iter=500, model__penalty=l1;, score=nan total time=   1.1s
[CV 4/5] END model__dual=False, model__max_iter=500, model__penalty=l1;, score=nan total time=   1.0s
[CV 5/5] END model__dual=False, model__max_iter=500, model__penalty=l1;, score=nan total time=   1.1s
[CV 1/5] END model__dual=False, model__max_iter=500, model__penalty=elasti

75 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 452

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prevec',
                                        Pipeline(steps=[('vect',
                                                         CountVectorizer()),
                                                        ('tfidf',
                                                         TfidfTransformer())])),
                                       ('model',
                                        LogisticRegression(random_state=42))]),
             n_jobs=3,
             param_grid={'model__dual': [True, False],
                         'model__max_iter': [100, 500, 1000],
                         'model__penalty': ['l2', 'l1', 'elasticnet']},
             scoring='accuracy', verbose=4)

In [35]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best model parameters: {'model__dual': False, 'model__max_iter': 100, 'model__penalty': 'l2'}
Best score: 0.22901234567901235


In [36]:
y_pred = grid.predict(x_test)

print(f'Acc: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred,target_names=y_train.unique()))

Acc: 0.19027777777777777
              precision    recall  f1-score   support

           5       0.32      0.34      0.33      1620
           1       0.22      0.26      0.24      1620
           3       0.15      0.15      0.15      1620
           2       0.15      0.16      0.16      1620
           0       0.14      0.16      0.15      1620
           4       0.16      0.13      0.15      1620
           7       0.21      0.21      0.21      1620
           6       0.15      0.10      0.12      1620

    accuracy                           0.19     12960
   macro avg       0.19      0.19      0.19     12960
weighted avg       0.19      0.19      0.19     12960



### Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
pipe = Pipeline([
    ('prevec',wordvecpipe),
    ('model', RandomForestClassifier(random_state=RANDOM_SEED))
])

grid = GridSearchCV(
    estimator=pipe, 
    param_grid={
                'model__max_depth':[2,5,10],
                'model__n_estimators':[200,300,2000]
                },
    cv=5,
    n_jobs=3,
    verbose=4,
    scoring='accuracy',
    refit=True
)
grid.fit(x_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END model__max_depth=2, model__n_estimators=200;, score=0.166 total time=   2.6s
[CV 2/5] END model__max_depth=2, model__n_estimators=200;, score=0.160 total time=   2.6s
[CV 3/5] END model__max_depth=2, model__n_estimators=200;, score=0.230 total time=   2.6s
[CV 4/5] END model__max_depth=2, model__n_estimators=200;, score=0.223 total time=   2.4s
[CV 5/5] END model__max_depth=2, model__n_estimators=200;, score=0.164 total time=   2.4s
[CV 1/5] END model__max_depth=2, model__n_estimators=300;, score=0.166 total time=   3.1s
[CV 2/5] END model__max_depth=2, model__n_estimators=300;, score=0.160 total time=   3.1s
[CV 3/5] END model__max_depth=2, model__n_estimators=300;, score=0.228 total time=   3.1s
[CV 4/5] END model__max_depth=2, model__n_estimators=300;, score=0.230 total time=   3.0s
[CV 5/5] END model__max_depth=2, model__n_estimators=300;, score=0.169 total time=   3.0s
[CV 1/5] END model__max_depth=2, model__

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('prevec',
                                        Pipeline(steps=[('vect',
                                                         CountVectorizer()),
                                                        ('tfidf',
                                                         TfidfTransformer())])),
                                       ('model',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=3,
             param_grid={'model__max_depth': [2, 5, 10],
                         'model__n_estimators': [200, 300, 2000]},
             scoring='accuracy', verbose=4)

In [39]:
print(f"Best model parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Best model parameters: {'model__max_depth': 10, 'model__n_estimators': 2000}
Best score: 0.2036651234567901


In [40]:
y_pred = grid.predict(x_test)

print(f'Acc: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred,target_names=y_train.unique()))

Acc: 0.17098765432098764
              precision    recall  f1-score   support

           5       0.31      0.41      0.35      1620
           1       0.19      0.08      0.11      1620
           3       0.12      0.12      0.12      1620
           2       0.17      0.11      0.13      1620
           0       0.13      0.38      0.20      1620
           4       0.12      0.04      0.06      1620
           7       0.20      0.17      0.18      1620
           6       0.12      0.06      0.08      1620

    accuracy                           0.17     12960
   macro avg       0.17      0.17      0.15     12960
weighted avg       0.17      0.17      0.15     12960



### Stacked Model

In [13]:
from sklearn.ensemble import StackingClassifier

In [14]:
NaBa = ('NaBa',MultinomialNB())
sgdc = ('SGDCl',SGDClassifier(l1_ratio= 0.1, loss= 'modified_huber', penalty= 'elasticnet'))
logreg = LogisticRegression(dual= False, max_iter= 100, penalty= 'l2')

stack = StackingClassifier(estimators=(NaBa,sgdc),
                            final_estimator=logreg)
                            
pipe = Pipeline([('prevec',wordvecpipe),('stackmodel',stack)])

In [15]:
pipe.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('prevec',
                 Pipeline(steps=[('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer())])),
                ('stackmodel',
                 StackingClassifier(estimators=(('NaBa', MultinomialNB()),
                                                ('SGDCl',
                                                 SGDClassifier(l1_ratio=0.1,
                                                               loss='modified_huber',
                                                               penalty='elasticnet'))),
                                    final_estimator=LogisticRegression()))])

In [16]:
y_pred = pipe.predict(x_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=y_train.unique()))

accuracy 0.20146604938271606
              precision    recall  f1-score   support

           5       0.35      0.39      0.37      1620
           1       0.23      0.32      0.27      1620
           3       0.15      0.16      0.15      1620
           2       0.16      0.07      0.10      1620
           0       0.15      0.16      0.16      1620
           4       0.14      0.09      0.11      1620
           7       0.22      0.22      0.22      1620
           6       0.16      0.19      0.17      1620

    accuracy                           0.20     12960
   macro avg       0.19      0.20      0.19     12960
weighted avg       0.19      0.20      0.19     12960



### Export Stacked Model

In [17]:
from joblib import dump
#dump(pipe, 'class_model.joblib')

['class_model.joblib']