# Modeling!

In [6]:
import ast
import re
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


## Textual Data processing

1. Tokenization
2. Stopword removal
3. Stemming
4. Lemmatization
5. Vectorization

In [5]:
# Read the labeled dataset from its parquet file and preview the first rows.
dataset_pd = pd.read_parquet(path="dataset/dataset_labeled.parquet")
dataset_pd.head(n=5)

Unnamed: 0_level_0,page_count,figure_count,author_count,year,month,day,text,page_imputed,citation_bucket
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10.1002/adfm.202001307,25.0,13.0,5,2020,7,30,"Qinghua Zhao, Wanqi Jie, Tao Wang, Andres Cast...",False,high
10.1002/cphc.200900857,15.0,4.0,8,2010,2,4,"Haifeng Ma, Thomas Brugger, Simon Berner, Yun ...",True,high
10.1002/prop.200710532,5.0,0.0,2,2015,5,20,"Milovan Vasilic, Marko Vojinovic Interaction o...",False,low
10.1007/978-3-030-30493-5_44,15.0,0.0,3,2019,12,3,"Itay Mosafi, Eli David, Nathan S. Netanyahu De...",True,low
10.1007/s00025-018-0843-4,15.0,0.0,2,2018,6,8,Deepshikha and Lalit K. Vashisht Weaving K-fra...,True,high


### Attempt 1: Naive Bag of Words
- No stopword removal
- No stemming
- No lemmatization


In [None]:
# Column groups of `dataset_pd`, named once so that every transformer below
# agrees on which columns it owns.
TEXT_COLUMN = "text"
DATE_COLUMNS = ["year", "month", "day"]
COUNT_COLUMNS = ["page_count", "figure_count", "author_count"]
FLAG_COLUMNS = ["page_imputed"]

# Naive bag of words: word-level analyzer, default tokenization, no stop-word
# removal.
count_vect = CountVectorizer(stop_words=None, tokenizer=None, analyzer="word")

# Citation buckets; the list order defines the integer code assigned by the
# OrdinalEncoder (low -> 0, medium -> 1, high -> 2, star -> 3).
categories = ["low", "medium", "high", "star"]

# Text-only pipeline that vectorizes a 1-D collection of documents.
text_pipeline = Pipeline([("count_vect", count_vect)])

# OrdinalEncoder expects ONE category list PER input column, so the single
# label column needs a nested list. It also expects 2-D input, i.e. fit it on
# dataset_pd[["citation_bucket"]] rather than on the bare Series.
label_pipeline = Pipeline(
    [("citation_bucket", OrdinalEncoder(categories=[categories]))]
)

# Each numeric group is a (name, transformer) pair that scales only its own
# columns and drops everything else, so both can be applied side by side to
# the same DataFrame.
date_features = (
    "date_features",
    ColumnTransformer(
        [("year_month_day", MinMaxScaler(), DATE_COLUMNS)],
        remainder="drop",
    ),
)

ratio_features = (
    "ratio_features",
    ColumnTransformer(
        [("counts", StandardScaler(), COUNT_COLUMNS)],
        remainder="drop",
    ),
)

# The two ColumnTransformers must not be chained as consecutive Pipeline
# steps: the first one outputs a NumPy array, so the second could no longer
# select its columns by name. A FeatureUnion feeds the original DataFrame to
# both and concatenates their outputs (dates first, then counts).
feature_pipeline = Pipeline(
    [("numeric_features", FeatureUnion([date_features, ratio_features]))]
)

# Full feature matrix: bag-of-words text + scaled numeric features + the
# boolean imputation flag. Columns not listed here (notably the
# `citation_bucket` label) are dropped by ColumnTransformer's default
# remainder="drop".
pipeline = Pipeline(
    [
        (
            "features",
            ColumnTransformer(
                [
                    # A scalar column name (not a list) hands CountVectorizer
                    # the 1-D sequence of strings it requires.
                    ("text", count_vect, TEXT_COLUMN),
                    ("numeric", feature_pipeline, DATE_COLUMNS + COUNT_COLUMNS),
                    ("other", "passthrough", FLAG_COLUMNS),
                ]
            ),
        )
    ]
)

## Feature Selection and dimensionality reduction
Analyze the data to decide which features to keep and whether dimensionality reduction is needed.


## Model training and evaluation

1. Split the data into train, validation, and test sets
2. Select appropriate model candidates
3. Train the models and evaluate them
4. Select best model
5. Perform hyperparameter tuning