# Notebook to predict citation counts
* Using arXiv data
* Using PubMed data

In [None]:
import logging
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from pysrc.config import *
from pysrc.papers.db.pm_postgres_loader import PubmedPostgresLoader
from pysrc.prediction.predict_analyzer import PredictAnalyzer


# Log INFO and above, prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# NOTE(review): test=False presumably selects the non-test settings — confirm in PubtrendsConfig.
config = PubtrendsConfig(test=False)

# The loader and the analyzer share the same configuration object.
loader = PubmedPostgresLoader(config)
# Analyzer configures progress in loader.
analyzer = PredictAnalyzer(loader, config)

In [None]:
# Ask the analyzer for paper ids, capped by limit=10_000.
ids = analyzer.search(limit=10_000)

In [None]:
# Run the analysis for the found ids; later cells read analyzer.df and analyzer.cit_df.
analyzer.analyze(ids)

### Create a balanced dataset with respect to citation count

In [None]:
# Citations histogram of the 'total' column; bins are 10 wide up to 100, then 200, 500 and 1000.
analyzer.df['total'].hist(bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500, 1000])
plt.show()

In [None]:
# Balance dataset of papers without any citations and with citations:
# keep every paper with total > 0 and at most the first 10000 papers with total == 0
# (head() takes them in frame order, not at random), then drop the 'aux' column.
papers_df_not_null = analyzer.df[analyzer.df.total > 0]
papers_df_null = analyzer.df[analyzer.df.total == 0].head(n=10000)
papers_df = pd.concat([papers_df_not_null, papers_df_null]).drop(columns=['aux'])

In [None]:
# Balanced citations histogram (same bins as the unbalanced one)
papers_df['total'].hist(bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500, 1000])
plt.show()

In [None]:
# Histogram of the 'year' column of the balanced dataset
papers_df['year'].hist()
plt.show()

In [None]:
# Filter only 1975-2015 years (both bounds inclusive).
# Take an explicit copy: later cells add columns to this frame (before_<year>, topic_rank,
# diversity), and writing into a filtered view of papers_df triggers SettingWithCopyWarning.
papers_df_years = papers_df[papers_df.year.between(1975, 2015)].copy()

### Add topics using LDA algorithm to dataframe only based on texts, without graph information

In [None]:
# Number of LDA topics and the matching per-topic column names: topic0 .. topic19.
n_topics = 20
topic_names = [f'topic{i}' for i in range(n_topics)]

In [None]:
# Years for which the per-year features are computed.
start_year, end_year = 1995, 2016  # end year exclusive
# Per-year LDA results, filled in by the topic-modelling loop.
topics_info = {}

In [None]:
def find_topic(row):
    """Return the position of the largest value in ``row`` (the first one on ties)."""
    return np.array(row).argmax()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from pysrc.papers.analysis.text import vectorize_corpus


# Note: this method was restored in papers.utils to restore notebook functionality.
# TODO: Refactor with newer code.
def lda_topics_df(df, n_words, n_topics):
    """Vectorize the articles of ``df`` and fit an LDA topic model on the term counts.

    :param df: articles, passed as-is to ``vectorize_corpus``
    :param n_words: not used by the body; the vocabulary limit comes from ``VECTOR_WORDS``
    :param n_topics: number of LDA components
    :return: tuple ``(corpus_tokens, corpus_counts, topics, lda)`` where ``topics`` is the
        result of ``lda.transform(corpus_counts)`` and ``lda`` is the fitted model
    """
    logging.info(f'Building corpus from {len(df)} articles')
    _, corpus_tokens, corpus_counts = vectorize_corpus(
        df, max_features=VECTOR_WORDS, min_df=VECTOR_MIN_DF, max_df=VECTOR_MAX_DF
    )

    logging.info('Performing LDA topic analysis')
    # random_state is fixed so repeated runs give the same topics
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    topics = lda.fit(corpus_counts).transform(corpus_counts)
    logging.info('Done')
    return corpus_tokens, corpus_counts, topics, lda

In [None]:
def explain_lda_topics(corpus_tokens, corpus_counts, lda, n_top_words=20):
    """Describe every LDA topic by its highest-weighted tokens.

    :param corpus_tokens: sequence of tokens, indexed like the columns of ``lda.components_``
    :param corpus_counts: not used by the body
    :param lda: object with a ``components_`` attribute (one weight row per topic)
    :param n_top_words: how many tokens to keep per topic
    :return: dict ``{topic index: [(weight, token), ...]}`` ordered by decreasing weight
    """
    explanations = {}
    for topic_idx, weights in enumerate(lda.components_):
        # argsort is ascending, so reverse it and keep the leading n_top_words positions
        top_positions = weights.argsort()[::-1][:n_top_words]
        explanations[topic_idx] = [(weights[pos], corpus_tokens[pos]) for pos in top_positions]
    return explanations

In [None]:
# Fit one LDA model per year on all papers published up to and including that year.
for year in range(start_year, end_year):
    logging.info(f"Find topics for year {year}")
    # Select the papers once and reuse the selection for the LDA input and for the index below.
    papers_up_to_year = papers_df_years[papers_df_years.year <= year]
    corpus_tokens, corpus_counts, topics, lda = lda_topics_df(
        papers_up_to_year, n_words=1000, n_topics=n_topics
    )

    # One row of topic probabilities per paper, indexed like the papers themselves.
    topics_df = pd.DataFrame(data=topics, columns=topic_names, index=papers_up_to_year.index)
    topics_df['main_topic'] = topics_df[topic_names].apply(find_topic, axis=1)

    explanations = explain_lda_topics(corpus_tokens, corpus_counts, lda, n_top_words=20)

    topics_info[year] = {'topics': topics_df, 'lda': lda,
                         'corpus_tokens': corpus_tokens, 'corpus_counts': corpus_counts,
                         'explanations': explanations}

In [None]:
# (weight, token) lists of every topic for the model fitted on papers published up to 2000.
topics_info[2000]['explanations']

In [None]:
# Count citations received up to each year (that year included).
def before_year_citations(df):
    """Add cumulative citation columns ``before_<year>`` to ``df`` in place.

    Reads the per-year columns ``start_year`` .. ``end_year - 1`` of ``df``;
    ``before_<start_year>`` equals the ``start_year`` column and every later column adds
    that year's citations to the previous total. Returns nothing.
    """
    running_total = df[start_year]
    df[f'before_{start_year}'] = running_total
    for year in range(start_year + 1, end_year):
        running_total = running_total + df[year]
        df[f'before_{year}'] = running_total

In [None]:
# Adds the before_<year> columns to papers_df_years in place.
before_year_citations(papers_df_years)

In [None]:
# For every year: citations attributed to each topic, and the topics ranked by that amount.
topic_citations = {}
topic_ranks = {}
for year in range(start_year, end_year):
    logging.info(f"Counting topics citations for year {year}")
    year_topics = topics_info[year]['topics']
    # Cumulative citations of the papers published up to `year`. The vector is the same for
    # every topic, so it is selected once per year rather than once per topic.
    cit_documents = papers_df_years[papers_df_years.year <= year][f'before_{year}']
    if year_topics.shape[0] != cit_documents.shape[0]:
        raise ValueError(f"Topics and citations cover different papers for year {year}: "
                         f"{year_topics.shape[0]} vs {cit_documents.shape[0]}")

    # Citations of a topic = sum over papers of P(topic | paper) * citations of the paper.
    topic_citations[year] = [np.dot(year_topics[f'topic{i}'], cit_documents) for i in range(n_topics)]
    # Rank 1 is the most cited topic; ties share the smallest rank.
    topic_ranks[year] = pd.Series(topic_citations[year]).rank(ascending=False, method='min')

In [None]:
def get_topic_rank(row):
    """Return the citation rank of the paper's main topic in the paper's publication year.

    :param row: paper row; ``row.year`` selects the per-year topic model and ``row.name``
        (the row's index label) selects the paper inside it
    :return: the entry of ``topic_ranks[row.year]`` for the paper's main topic
    """
    year_topics = topics_info[row.year]['topics']
    # Read the single cell: selecting the whole row first would upcast the integer topic
    # index to float together with the probabilities, and the rank would then be looked up
    # by a float label.
    main_topic = int(year_topics.loc[row.name, 'main_topic'])
    return topic_ranks[row.year][main_topic]

In [None]:
# Topic models exist only for start_year .. end_year - 1, so only papers from start_year on
# are ranked; earlier papers get NaN through index alignment of the assignment.
papers_df_years['topic_rank'] = papers_df_years[papers_df_years.year >= start_year].apply(get_topic_rank, axis=1)

In [None]:
def get_diversity(row):
    """Return ``sum(p * log(p))`` over the paper's topic probabilities.

    This is the negative Shannon entropy of the topic distribution: 0 for a paper that
    belongs to a single topic, more negative for a paper spread over many topics.

    :param row: paper row; ``row.year`` selects the per-year topic model and ``row.name``
        selects the paper inside it
    :return: float
    """
    probs = topics_info[row.year]['topics'].loc[row.name, topic_names].to_numpy(dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = probs * np.log(probs)
    # 0 * log(0) evaluates to NaN, but the limit of p * log(p) at p -> 0 is 0.
    terms[probs == 0] = 0.0
    return float(terms.sum())

In [None]:
# Topic models exist only for start_year .. end_year - 1, so diversity is computed only for
# papers from start_year on; earlier papers get NaN through index alignment.
papers_df_years['diversity'] = papers_df_years[papers_df_years.year >= start_year].apply(get_diversity, axis=1)

### Preprocessing (authors and journals)

In [None]:
! pip install scholarmetrics pbr
from statistics import mean
from scholarmetrics import hindex, gindex
# An h-index of x means that the author has at least x publications that have each been cited at least x times.
# A g-index of x means that the author's top x publications together accumulated at least x^2 citations.

In [None]:
def citations_after_n_years(row, n):
    """Sum the citations a paper received in its first ``n`` calendar years.

    :param row: mapping-like paper record with a ``'year'`` entry and per-year citation
        counts keyed by the year itself
    :param n: number of years to include, starting with the publication year
    :return: total over the years ``year .. year + n - 1`` that are present in ``row``;
        0 when none is present
    """
    first_year = row['year']
    return sum(row[y] for y in range(first_year, first_year + n) if y in row)

In [None]:
def journal_rank_and_mean(df):
    """Compute, for every year, the mean citations and the citation rank of each journal.

    For each year in ``start_year .. end_year - 1`` the papers published up to that year are
    grouped by journal; journals named ``''`` or ``-1`` are discarded before ranking.

    :param df: papers with ``journal``, ``total`` and ``year`` columns
    :return: DataFrame with columns ``journal``, ``journal_citations`` (mean of ``total``),
        ``rank`` (1 = highest mean, ties share the smallest rank) and ``year``
    """
    yearly_frames = []
    for year in range(start_year, end_year):
        journals_citations_cur = (
            df.loc[df.year <= year, ['journal', 'total']]
            .groupby('journal')
            .agg({'total': 'mean'})
            .reset_index()
            .rename(columns={'total': 'journal_citations'})
        )

        # Assign the result back: replace(inplace=True) on a selected column works on a
        # temporary object under pandas copy-on-write and would leave the frame unchanged.
        journals_citations_cur['journal'] = journals_citations_cur['journal'].replace(
            {'': np.nan, -1: np.nan})
        journals_citations_cur = journals_citations_cur.dropna(subset=['journal']).copy()

        journals_citations_cur['rank'] = journals_citations_cur['journal_citations'].rank(
            ascending=False, method='min')
        journals_citations_cur['year'] = year
        yearly_frames.append(journals_citations_cur)

    if not yearly_frames:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop re-copies all previous years every time.
    return pd.concat(yearly_frames, axis=0)

In [None]:
def build_journals_graph(df, cit_df, directed=False):
    """Build a weighted journal-to-journal graph from paper-level citation pairs.

    :param df: papers with ``id`` and ``journal`` columns
    :param cit_df: citation pairs with ``id_out`` and ``id_in`` columns; only pairs whose
        both papers are present in ``df`` are kept
    :param directed: when True build an ``nx.DiGraph`` with edges ``journal_out -> journal_in``;
        the default keeps the undirected ``nx.Graph``
    :return: networkx graph whose ``weight`` edge attribute is the number of citation pairs
        between the two journals

    NOTE(review): in the undirected graph the pairs (A, B) and (B, A) are the same edge, so
    only one of the two weights survives. Consider ``directed=True`` for PageRank — confirm
    the direction semantics of ``id_out``/``id_in`` first.
    """
    journals = df[['id', 'journal']]
    with_journal_out = pd.merge(cit_df, journals, left_on='id_out', right_on='id').rename(
        columns={'journal': 'journal_out'}).drop(columns=['id'])

    journal_edges = pd.merge(with_journal_out, journals, left_on='id_in', right_on='id').rename(
        columns={'journal': 'journal_in'}).drop(columns=['id'])[['journal_out', 'journal_in']]
    # clear empty journals
    journal_edges = journal_edges.replace({'': np.nan}).dropna()

    # one row per journal pair, weighted by the number of citation pairs
    journal_edges = journal_edges.groupby(['journal_out', 'journal_in']).size().reset_index(name='weight')

    graph_type = nx.DiGraph if directed else nx.Graph
    return nx.from_pandas_edgelist(journal_edges, 'journal_out', 'journal_in', 'weight',
                                   create_using=graph_type)


In [None]:
# Per year: PageRank of every journal in the graph of papers published up to that year,
# kept both as a dict ({journal: score}) and as a two-column frame for merging.
pagerank_journals = {}
pagerank_journals_df = {}
for year in range(start_year, end_year):
    logging.info(f"Count pagerank of each journal for year {year}")
    journals_graph = build_journals_graph(papers_df_years[papers_df_years.year <= year], analyzer.cit_df)
    pagerank_journals[year] = nx.pagerank(journals_graph, alpha=0.85)
    # One-row frame with a column per journal, transposed into one row per journal.
    pagerank_journals_df[year] = pd.DataFrame([pagerank_journals[year]]).transpose().reset_index()
    pagerank_journals_df[year].columns = ['journal', 'pagerank']

In [None]:
# Per-year mean citations and rank of every journal.
journals_citations_years = journal_rank_and_mean(papers_df_years)

In [None]:
# Preview the first rows of the per-year journal statistics.
journals_citations_years.head()

In [None]:
from pysrc.papers.analysis.metadata import split_df_list


def build_authors_graph(df, cit_df, directed=False):
    """Build a weighted author-to-author graph from paper-level citation pairs.

    :param df: papers with ``authors`` (names joined by ``', '``) and ``id`` columns;
        the frame is not modified
    :param cit_df: citation pairs with ``id_out`` and ``id_in`` columns; only pairs whose
        both papers are present in ``df`` are kept
    :param directed: when True build an ``nx.DiGraph`` with edges ``author_out -> author_in``;
        the default keeps the undirected ``nx.Graph``
    :return: networkx graph whose ``weight`` edge attribute is the number of
        (citation pair, author, author) combinations; an author who appears on both papers
        of a pair gets a self-loop

    NOTE(review): in the undirected graph the pairs (A, B) and (B, A) are the same edge, so
    only one of the two weights survives. Consider ``directed=True`` for PageRank — confirm
    the direction semantics of ``id_out``/``id_in`` first.
    """
    # Work on a copy: the previous in-place replace/dropna acted on a column slice of the
    # caller's frame, which warns in pandas 2 and has no effect under copy-on-write.
    authors_df = df[['authors', 'id']].copy()
    authors_df['authors'] = authors_df['authors'].replace({'': np.nan, -1: np.nan})
    authors_df = authors_df.dropna(subset=['authors'])

    # one row per (paper, author)
    authors_df = split_df_list(authors_df, target_column='authors', separator=', ').rename(
        columns={'authors': 'author'})

    with_author_out = pd.merge(cit_df, authors_df, left_on='id_out', right_on='id').rename(
        columns={'author': 'author_out'}).drop(columns=['id'])

    author_edges = pd.merge(with_author_out, authors_df, left_on='id_in', right_on='id').rename(
        columns={'author': 'author_in'}).drop(columns=['id'])[['author_out', 'author_in']]

    # clear empty authors
    author_edges = author_edges.replace({'': np.nan}).dropna()

    author_edges = author_edges.groupby(['author_out', 'author_in']).size().reset_index(name='weight')

    graph_type = nx.DiGraph if directed else nx.Graph
    return nx.from_pandas_edgelist(author_edges, 'author_out', 'author_in', 'weight',
                                   create_using=graph_type)

### Compute PageRank and productivity of authors from the author citation graph

In [None]:
# Per year: {author: {'pagerank': ..., 'productivity': ...}} for the authors in the graph.
author_graph_features = {}
for year in range(start_year, end_year):
    logging.info(f"Started counting graph of authors citations for year {year}")
    # Graph over the authors of papers published up to `year`.
    authors_graph = build_authors_graph(papers_df_years[papers_df_years.year <= year], analyzer.cit_df)
    author_graph_features[year] = nx.pagerank(authors_graph, alpha=0.85)

    # Re-shape {author: pagerank} into per-author feature dicts; productivity starts at 0.
    author_graph_features[year] = {k: {'pagerank': v, 'productivity': 0}
                                   for k, v in author_graph_features[year].items()}
    # 'productivity' is the weight of the author's self-loop, i.e. the number of citation
    # pairs that have this author on both papers; authors without a self-loop keep 0.
    for author, _, weight in nx.selfloop_edges(authors_graph, data='weight'):
        author_graph_features[year][author]['productivity'] = weight

In [None]:
def author_features(df):
    """Compute per-year citation statistics of every author.

    For each year in ``start_year .. end_year - 1`` the papers published up to that year are
    grouped by author.

    :param df: papers with ``authors`` (names joined by ``', '``), ``total`` and ``year``
        columns; the frame is not modified
    :return: dict ``{year: {author: {'total', 'rank', 'hindex', 'gindex', 'co_authors'}}}``
        where ``total`` is the mean of the papers' ``total``, ``rank`` orders authors by that
        mean (1 = highest) and ``co_authors`` is the mean number of co-authors per paper
    """
    # Work on a copy: the previous in-place replace/dropna and the new column were applied
    # to a column slice of the caller's frame (SettingWithCopyWarning, no-op under
    # copy-on-write).
    author_total = df[['authors', 'total', 'year']].copy()
    author_total['authors'] = author_total['authors'].replace({'': np.nan, -1: np.nan})
    author_total = author_total.dropna(subset=['authors']).copy()
    author_total['co_authors'] = author_total['authors'].apply(lambda authors: len(authors.split(', ')) - 1)

    # one row per (paper, author)
    author_total = split_df_list(author_total, target_column='authors', separator=', ')

    authors_dict_years = {}
    for year in range(start_year, end_year):
        logging.info(f"Started counting authors ranks and mean number citations for year {year}")
        authors_citations = author_total[author_total.year <= year].groupby(['authors']).agg(
            {'total': ['mean', hindex, gindex], 'co_authors': 'mean'}).reset_index()
        # Flatten the two-level column index produced by agg.
        authors_citations.columns = ['author', 'total', 'hindex', 'gindex', 'co_authors']

        authors_citations = authors_citations.loc[authors_citations['author'] != ''].copy()
        authors_citations['rank'] = authors_citations['total'].rank(ascending=False, method='min')

        authors_dict_years[year] = authors_citations.set_index('author')[
            ['total', 'rank', 'hindex', 'gindex', 'co_authors']].to_dict(orient='index')

    return authors_dict_years

In [None]:
# Per-year author statistics, looked up by get_authors_features.
authors_dict_years = author_features(papers_df_years)

In [None]:
def get_authors_features(row):
    """Average the citation statistics of a paper's authors for the paper's year.

    Only the first 10 authors and the last one are considered. Authors missing from
    ``authors_dict_years[year]`` are skipped.

    :param row: paper row with ``authors`` (names joined by ``', '``) and ``year``
    :return: ``pd.Series`` of four values — mean rank, h-index, g-index and number of
        co-authors; all ``None`` when the paper has no author string, NaN for a statistic
        when none of the authors is known
    """
    authors = row.authors
    # NaN, None, -1 and '' all mean "no authors"; only a non-empty string can be split.
    if not isinstance(authors, str) or not authors:
        return pd.Series([None, None, None, None])

    authors_list = authors.split(', ')
    if len(authors_list) > 10:
        authors_list = authors_list[:10] + [authors_list[-1]]

    known_authors = authors_dict_years[row['year']]
    found = [known_authors[author] for author in authors_list if author in known_authors]

    def mean_of(key):
        # statistics.mean raises on empty input, so fall back to NaN explicitly.
        values = [features[key] for features in found if features[key] is not None]
        return mean(values) if values else np.nan

    return pd.Series([mean_of('rank'), mean_of('hindex'), mean_of('gindex'), mean_of('co_authors')])

In [None]:
def get_authors_graph_features(row):
    """Average the citation-graph features of a paper's authors for the paper's year.

    Only the first 10 authors and the last one are considered. Authors missing from
    ``author_graph_features[year]`` are skipped, as in ``get_authors_features``; previously
    their NaN defaults were never filtered out and turned the whole mean into NaN.

    :param row: paper row with ``authors`` (names joined by ``', '``) and ``year``
    :return: ``pd.Series`` of two values — mean PageRank and mean productivity; both ``None``
        when the paper has no author string, NaN when none of the authors is in the graph
    """
    authors = row.authors
    # NaN, None, -1 and '' all mean "no authors"; only a non-empty string can be split.
    if not isinstance(authors, str) or not authors:
        return pd.Series([None, None])

    authors_list = authors.split(', ')
    if len(authors_list) > 10:
        authors_list = authors_list[:10] + [authors_list[-1]]

    known_authors = author_graph_features[row['year']]
    found = [known_authors[author] for author in authors_list if author in known_authors]
    if not found:
        return pd.Series([np.nan, np.nan])

    pageranks = [features['pagerank'] for features in found]
    productivities = [features['productivity'] for features in found]
    return pd.Series([mean(pageranks), mean(productivities)])

In [None]:
def get_authors_papers(authors_str):
    """Return the mean and the maximum paper count over a paper's authors.

    Only the first 10 authors and the last one are considered; an author missing from
    ``authors_papers_dict`` counts as 1.

    NOTE(review): ``authors_papers_dict`` is not defined in the visible part of the
    notebook — confirm where it comes from.

    :param authors_str: author names joined by ``', '``
    :return: ``pd.Series([mean, max])``
    """
    names = authors_str.split(', ')
    if len(names) > 10:
        names = names[:10] + names[-1:]
    paper_counts = [authors_papers_dict[name] if name in authors_papers_dict else 1 for name in names]
    return pd.Series([mean(paper_counts), max(paper_counts)])

In [None]:
def get_authors_citations(authors_str):
    """Return the mean and the maximum citation value over a paper's authors.

    Only the first 10 authors and the last one are considered; an author missing from
    ``authors_dict`` counts as 0.

    NOTE(review): ``authors_dict`` is not defined in the visible part of the notebook —
    confirm where it comes from.

    :param authors_str: author names joined by ``', '``
    :return: ``pd.Series([mean, max])``
    """
    names = authors_str.split(', ')
    if len(names) > 10:
        names = names[:10] + names[-1:]
    citations = [authors_dict[name] if name in authors_dict else 0 for name in names]
    return pd.Series([mean(citations), max(citations)])

In [None]:
# Preview the per-year journal statistics that preprocess_as_in_paper merges in.
journals_citations_years.head()

In [None]:
def preprocess_as_in_paper(df2, step=5, current_year=2020):
    """Build the model features and targets for every paper of ``df2``.

    :param df2: papers with ``year``, ``authors``, ``journal``, ``title``, ``abstract``,
        ``topic_rank``, ``diversity`` and per-year citation columns; not modified
    :param step: early-citation columns ``c1 .. c<step + 1>`` are computed
    :param current_year: reference year for ``recency = current_year - year``
    :return: tuple ``(frame, final_features, final_targets)`` where ``frame`` holds exactly
        the feature and target columns and the targets are ``['c1', 'c5']``
    """
    df = df2.copy()
    df['year'] = df['year'].astype(int)
    df['recency'] = current_year - df['year']
    final_features = ['recency', 'topic_rank', 'diversity']

    #   early citations (cumulative): c<i> covers the first i calendar years of the paper
    for i in range(1, step + 2):
        df[f'c{i}'] = df.apply(lambda row, n=i: citations_after_n_years(row, n=n), axis=1)

    logging.info("Done counting early citations")

    features_to_add = ['authors_mean_rank', 'authors_mean_hindex', 'authors_mean_gindex', 'authors_mean_sociality']
    final_features += features_to_add
    df[features_to_add] = df[['authors', 'year']].apply(get_authors_features, axis=1)

    logging.info("Done counting author rank, h-indexes, g-indexes and sociality")

    features_to_add = ['authors_mean_pagerank', 'authors_mean_productivity']
    final_features += features_to_add
    df[features_to_add] = df[['authors', 'year']].apply(get_authors_graph_features, axis=1)

    logging.info("Done counting author pagerank and productivity")

    # Journal PageRank of the paper's own publication year. The lookup used to read
    # pagerank_journals_df[year] with `year` being whatever value an earlier notebook loop
    # left behind, so every paper received the scores of that single (last) year.
    journal_pagerank = pd.concat(
        [scores.assign(year=pagerank_year) for pagerank_year, scores in pagerank_journals_df.items()],
        ignore_index=True)
    df = pd.merge(df, journal_pagerank, on=['journal', 'year'], how='left').rename(
        columns={'pagerank': 'journal_pagerank'})
    df = pd.merge(df, journals_citations_years[['journal', 'rank', 'year']], on=['journal', 'year'], how='left').rename(
        columns={'rank': 'journal_rank'})
    final_features += ['journal_pagerank', 'journal_rank']
    logging.info("Done counting rank and pagerank of each journal")

    #   extra features
    df['title_len'] = df['title'].apply(lambda title: 0 if pd.isnull(title) else len(title))
    df['abstract_len'] = df['abstract'].apply(lambda abstract: 0 if pd.isnull(abstract) else len(abstract))
    # Non-string authors (NaN, None) count as zero authors instead of raising on split.
    df['n_authors'] = df['authors'].apply(
        lambda authors: len(authors.split(', ')) if isinstance(authors, str) else 0)
    final_features += ['title_len', 'abstract_len', 'n_authors']

    final_targets = ['c1', 'c5']

    return df[final_features + final_targets], final_features, final_targets

In [None]:
# Per-year author, journal and topic features exist only from start_year on.
test_df, features, targets = preprocess_as_in_paper(papers_df_years[papers_df_years.year >= start_year])

In [None]:
# Feature names returned by preprocess_as_in_paper.
features

In [None]:
# Hand-picked subset of ten feature names.
# NOTE(review): not referenced by any other cell in the visible part of the notebook.
ten_features = ['recency', 'topic_rank', 'diversity', 'authors_mean_rank', 'authors_mean_hindex',
                'authors_mean_sociality', 'authors_mean_pagerank', 'authors_mean_productivity',
                'journal_pagerank', 'journal_rank']

In [None]:
# Number of missing values per column.
test_df.isna().sum()

In [None]:
# Histogram of every feature and target column.
test_df.hist(figsize=(15, 15))
plt.show()

In [None]:
# Add log1p-transformed copies ('log_<name>') of the listed features. A feature is transformed
# only when its minimum (NaNs skipped) is non-negative; the new names go to logged_features.
features_to_log = ['authors_mean_pagerank', 'authors_mean_sociality']
logged_features = []
for f in features_to_log:
    if test_df[f].min(skipna=True) >= 0:
        test_df['log_' + f] = test_df[f].apply(lambda x: np.log1p(x))
        logged_features.append('log_' + f)

In [None]:
from sklearn.impute import SimpleImputer
# Cast everything to float32 and replace NaNs with the per-column median.
df = test_df.astype(np.float32)
print("Shape before imputing nans", df.shape)
# Imputer silently removes columns that contain only missing values
imp = SimpleImputer(missing_values=np.nan, strategy='median')
df = pd.DataFrame(imp.fit_transform(df))
print("Shape after", df.shape)
# Names of the columns that are not entirely NaN, in their original order; they label the
# imputed frame and are consulted by predict().
imputed_features = [c for c in test_df.columns if not test_df[c].isnull().values.all()]
df.columns = imputed_features
print("tranform done")

In [None]:
# Missing values per column after imputation.
df.isna().sum()

## Models with different features and target

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error as mse
from math import sqrt
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

In [None]:
# predict() and get_train_validate() keep only papers with recency > step + 1.
step = 5

In [None]:
def predict(df, features, target, model=None, show_plt=False, log_target=False, n=1,
            validation_recency=11):
    """Fit ``model`` on older papers and evaluate it on more recent ones, ``n`` times.

    Papers with ``recency <= step + 1`` are ignored. Of the rest, papers with
    ``recency > validation_recency`` form the training pool and the others the validation
    pool. Every round draws an 80% sample of each pool, standardizes the features with a
    scaler fitted on the training sample, fits the model and prints R^2 and RMSE.

    :param df: imputed frame with a ``recency`` column, the features and the target
    :param features: feature names; names missing from ``imputed_features`` are skipped
    :param target: name of the target column
    :param model: estimator with ``fit``/``predict``/``coef_``; a new ``LinearRegression``
        is created per call when omitted (the previous default was a single instance shared
        by all calls)
    :param show_plt: plot the histogram of validation errors for papers with non-zero target
    :param log_target: fit and evaluate on ``log1p(target)``
    :param n: number of resampling rounds, at least 1
    :param validation_recency: largest recency that still belongs to the validation pool
    :return: ``(reg, (X, y), (X_validate, y_validate), (coefs, r_squared, rmse))`` where the
        first three items come from the last round and the lists have one entry per round
    :raises ValueError: if ``n < 1`` or the target is not an imputed column
    """
    if model is None:
        model = LinearRegression()
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    if target not in imputed_features:
        # Otherwise the last feature would silently be used as the target.
        raise ValueError(f"Target {target!r} is not among the imputed columns")

    columns = [f for f in features if f in imputed_features] + [target]
    # Split on recency before selecting columns, so recency need not be a feature.
    eligible = df[df.recency > step + 1]
    train = eligible.loc[eligible.recency > validation_recency, columns]
    validate = eligible.loc[eligible.recency <= validation_recency, columns]

    sample_frac = 0.8
    coefs = []
    r_squared = []
    rmse = []
    for _ in range(n):
        train_sample = train.sample(frac=sample_frac)
        X = train_sample.iloc[:, :-1]
        y = train_sample.iloc[:, -1]
        val_sample = validate.sample(frac=sample_frac)
        X_validate = val_sample.iloc[:, :-1]
        y_validate = val_sample.iloc[:, -1]

        if log_target:
            y = np.log1p(y)
            y_validate = np.log1p(y_validate)

        # Fit the scaler on the training sample only, then apply it to both samples.
        scaler = StandardScaler().fit(X)
        X = scaler.transform(X)
        X_validate = scaler.transform(X_validate)

        reg = model.fit(X, y)
        coefs.append(reg.coef_)

        # Predict once and reuse the predictions for the plot and for every metric.
        predicted_train = reg.predict(X)
        predicted_validate = reg.predict(X_validate)

        if show_plt:
            errors = [predicted - actual
                      for actual, predicted in zip(y_validate, predicted_validate) if actual != 0]
            plt.hist(errors, bins=50)
            plt.show()

        round_r2 = (r2_score(y, predicted_train), r2_score(y_validate, predicted_validate))
        round_rmse = (sqrt(mse(predicted_train, y)), sqrt(mse(predicted_validate, y_validate)))
        r_squared.append(round_r2)
        rmse.append(round_rmse)
        print(f"R^2 train: {round_r2[0]} validate: {round_r2[1]}")
        print(f"RMSE train: {round_rmse[0]} validate: {round_rmse[1]}")

    return reg, (X, y), (X_validate, y_validate), (coefs, r_squared, rmse)

In [None]:
def print_top_influencers(df, features, reg, coefs=None, n=10):
    """Print and return the features with the largest absolute coefficients.

    :param df: frame that contains all ``features`` as columns
    :param features: feature names, in the order of the coefficients
    :param reg: fitted model with a ``coef_`` attribute; used when ``coefs`` is not given
    :param coefs: optional coefficients (list or array) that override ``reg.coef_``;
        ``None`` or an empty sequence falls back to ``reg.coef_``
    :param n: maximum number of features to report
    :return: list of the reported feature names, most influential first
    """
    # `if coefs:` raised "truth value of an array is ambiguous" for numpy arrays.
    use_reg = coefs is None or len(coefs) == 0
    values = np.asarray(reg.coef_ if use_reg else coefs)
    order = np.argsort(-np.abs(values))
    names = df[features].columns

    top_features = []
    for position in order[:min(n, len(features))]:
        print("{:.<060} {:< 010.4e}".format(names[position], values[position]))
        top_features.append(names[position])
    return top_features

### 1. Predict c5 given c1

In [None]:
target = 'c5'
# Start from the preprocessing feature list and add c1 as a predictor.
all_features = features + ['c1']  # + logged_features
print('All features', len(all_features))
print('Test DF shape', test_df.shape)
# Exclude the g-index feature; all_features is reused (and further modified) by later cells.
all_features.remove('authors_mean_gindex')
reg, (X, y), (X_val, y_val), extras = predict(df, all_features, target, model=LassoCV(cv=5), show_plt=True,
                                              log_target=True)

In [None]:
# Features with the largest absolute coefficients of the model fitted above.
print_top_influencers(test_df, all_features, reg)

In [None]:
# Predicted (x axis) against actual (y axis) values on the validation sample of the last round.
plt.plot(reg.predict(X_val), y_val, 'ro')
plt.show()

### 2. Predict c5 without any early-citation information


In [None]:
# Drop the early-citation feature; the list is modified in place, so re-running this cell raises ValueError.
all_features.remove('c1')

In [None]:
# No model is passed, so predict() uses its default estimator.
reg, (X, y), (X_val, y_val), extras = predict(df, all_features, 'c5', show_plt=True, log_target=True)
print_top_influencers(test_df, all_features, reg)

In [None]:
# Export everything except c1 without the row index (index is documented as a bool).
test_df.drop(['c1'], axis=1).to_csv('~/predict.csv', index=False)

#### Or with regularisation (L1 or L2)

In [None]:
# L1-regularized model with the regularization strength chosen by 5-fold cross-validation.
print(f"Lasso regularisation; target {target}")
Ls, (X, y), (X_val, y_val), extras = predict(df, all_features, target, model=LassoCV(cv=5), log_target=True)
print_top_influencers(test_df, all_features, Ls)

In [None]:
# Predicted (x axis) against actual (y axis) values of the Lasso model on its validation sample.
plt.plot(Ls.predict(X_val), y_val, 'ro')
plt.show()

In [None]:
# L2-regularized model with RidgeCV's default settings.
print(f"Ridge regularisation; target {target}")
Rr, (X, y), (X_val, y_val), extras = predict(df, all_features, target, model=RidgeCV(), log_target=True)
print_top_influencers(test_df, all_features, Rr)

In [None]:
# 300 resampling rounds; coefs, r_squared and rmse collect one entry per round.
reg, (X, y), (X_val, y_val), (coefs, r_squared, rmse) = predict(df, all_features, target,
                                                                model=LassoCV(cv=5, tol=0.4), log_target=True, n=300)

In [None]:
# Distribution of every feature's coefficient over the resampling rounds.
# Transposing coefs (one row per round) gives one row per feature.
for j in range(len(all_features)):
    print(all_features[j])
    plt.hist(np.transpose(coefs)[j])
    plt.show()

In [None]:
# Mean coefficient of every feature over the resampling rounds.
average_coef = []
for j in range(len(all_features)):
    average_coef.append(mean(np.transpose(coefs)[j]))

In [None]:
# Rank features by the averaged coefficients; reg is passed but coefs takes precedence.
top_features = print_top_influencers(df, all_features, reg, coefs=average_coef, n=15)

In [None]:
class linear_regression:
    """Minimal least-squares model that keeps ``[intercept, w_1, ..., w_k]`` in ``coef``.

    It can be fitted with ``fit`` or created directly from known coefficients.
    """

    def __init__(self, coef=None):
        """Store the coefficients.

        :param coef: ``[intercept, w_1, ..., w_k]``; a new empty list when omitted
        """
        # A fresh list per instance: the previous ``coef=[]`` default was one list shared
        # by every instance created without arguments.
        self.coef = [] if coef is None else coef

    def fit(self, X, y):
        """Fit on standardized ``X``.

        The intercept is the mean of ``y``; the weights solve the normal equations with a
        pseudo-inverse of ``X^T X``. Note that ``predict`` does not standardize its input.
        """
        self.X = preprocessing.scale(X)
        self.y = y
        intercept = sum(y) / len(y)
        x_transposed = self.X.transpose()
        gram_pinv = np.linalg.pinv(np.dot(x_transposed, self.X))
        weights = np.dot(np.dot(gram_pinv, x_transposed), y)
        self.coef = [intercept] + weights.tolist()

    def predict(self, x):
        """Return ``x . weights + intercept`` for the stored coefficients."""
        predicted_y = np.dot(x, self.coef[1:]) + self.coef[0]
        return predicted_y

In [None]:
def get_train_validate(df2):
    """Impute ``all_features + [target]`` and split the papers by recency.

    NaNs are replaced with the per-column median. Papers with ``recency <= step + 1`` are
    dropped; of the rest, ``recency > 11`` goes to train and ``recency <= 11`` to validate.

    :param df2: frame with the ``all_features`` columns and the ``target`` column
    :return: ``(train, validate)`` frames whose last column is the target
    :raises ValueError: if the target column is entirely NaN
    """
    columns = all_features + [target]
    df = df2[columns].copy().astype(np.float32)
    print("start fill nans")
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imputed = imp.fit_transform(df)
    # The imputer removes columns that are entirely NaN, so label the result with the
    # surviving names only; assigning the full list failed with a length mismatch.
    kept = [c for c in columns if not df[c].isnull().values.all()]
    if target not in kept:
        raise ValueError(f"Target {target!r} has no values")
    df = pd.DataFrame(imputed, columns=kept)

    print("tranform done")

    eligible = df[df.recency > step + 1]
    train = eligible[eligible.recency > 11]
    validate = eligible[eligible.recency <= 11]
    return train, validate


# Full (unsampled) train/validate split with log1p targets; the last column is the target.
train, validate = get_train_validate(test_df)
X = train.iloc[:, :-1]
y = np.log1p(train.iloc[:, -1])
X_validate = validate.iloc[:, :-1]
y_validate = np.log1p(validate.iloc[:, -1])
# Fit the scaler on train only, then apply it to both parts.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_validate = scaler.transform(X_validate)

In [None]:
# Zero out averaged coefficients whose absolute value is below 0.002 (modifies the list in place).
for i in range(len(all_features)):
    if abs(average_coef[i]) < 0.002:
        average_coef[i] = 0

In [None]:
# Prepend the mean of the log-transformed training target as the intercept.
all_average_coef = [mean(y)] + average_coef

In [None]:
# Model built directly from the averaged coefficients; fit() is not called.
reg_test = linear_regression(coef=all_average_coef)

In [None]:
# Predictions on the scaled training features (log scale).
y_predicted = reg_test.predict(X)

In [None]:
# Distribution of the predicted values.
plt.hist(y_predicted)
plt.show()

In [None]:
# Train / validate metrics of reg_test: R^2 on the log scale, R^2 on the citation-count
# scale, RMSE on the log scale.
print(r2_score(y, reg_test.predict(X)), r2_score(y_validate, reg_test.predict(X_validate)))
# The targets are log1p-transformed, so expm1 is the exact inverse.
print(r2_score(np.expm1(y), np.expm1(reg_test.predict(X))),
      r2_score(np.expm1(y_validate), np.expm1(reg_test.predict(X_validate))))
# The RMSE line used to evaluate `reg` (the model returned by predict()) instead of reg_test.
print(sqrt(mse(reg_test.predict(X), y)), sqrt(mse(reg_test.predict(X_validate), y_validate)))

In [None]:
# Predicted against actual values on the validation part prepared for reg_test. X_val/y_val
# came from an earlier predict() call, i.e. a different sample scaled by a different scaler.
plt.plot(reg_test.predict(X_validate), y_validate, 'ro')
plt.show()

### Use decision trees (random forest) instead of linear regression

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
# Rebinds df to the non-imputed features plus c5, cast to float32.
# NOTE(review): NaNs from test_df are kept here — confirm the estimators below accept them.
df = test_df[all_features + ['c5']].astype(np.float32)
columns = df.columns

In [None]:
# Same recency-based split as in predict(): recency > 11 for train, the rest for validation.
# NOTE(review): the name train_validate is reused for a function in the classification
# section; running that definition replaces this frame and vice versa.
train_validate = df[df.recency > step + 1][all_features + [target]]
train = train_validate[train_validate.recency > 11]
validate = train_validate[train_validate.recency <= 11]

# 70% random samples; the last column is the target, used on the log(x + 1) scale.
train_sample = train.sample(frac=0.7)
X = train_sample.iloc[:, :-1]
y = np.log(train_sample.iloc[:, -1] + 1)
validate_sample = validate.sample(frac=0.7)
X_validate = validate_sample.iloc[:, :-1]
y_validate = np.log(validate_sample.iloc[:, -1] + 1)

In [None]:
# 1000 shallow trees (max_depth=6) fitted on all cores; features are not scaled here.
regr = RandomForestRegressor(max_depth=6, min_samples_split=40, n_estimators=1000, n_jobs=-1, verbose=4)
regr.fit(X, y)

In [None]:
# RMSE on the log scale.
print(f"RMSE train: {sqrt(mse(regr.predict(X), y))} validate: {sqrt(mse(regr.predict(X_validate), y_validate))}")

In [None]:
# R^2 on the log scale.
print(f"R^2: train {regr.score(X, y)} validate: {regr.score(X_validate, y_validate)}")

In [None]:
# Validation R^2 after exponentiating the log-scale target and prediction.
print(r2_score(np.exp(y_validate), np.exp(regr.predict(X_validate))))

In [None]:
# Predicted (x axis) against actual (y axis) values on the validation sample.
plt.plot(regr.predict(X_validate), y_validate, 'ro')
plt.show()

In [None]:
# R^2 computed with r2_score on the model's predictions (log scale).
print(f"""R^2: train {r2_score(y, regr.predict(X))} validate: {r2_score(y_validate, regr.predict(X_validate))}""")

In [None]:
def print_top_influencers_tree(df, features, reg, n=10):
    """Print and return the features with the largest importances of a tree model.

    :param df: frame that contains all ``features`` as columns
    :param features: feature names, in the order of ``reg.feature_importances_``
    :param reg: fitted model with a ``feature_importances_`` attribute
    :param n: maximum number of features to report
    :return: list of the reported feature names, most important first
    """
    importances = reg.feature_importances_
    order = np.argsort(-np.abs(importances))
    ranked = importances[order]
    column_names = df[features].columns

    top_features = []
    for rank in range(min(n, len(features))):
        name = column_names[order[rank]]
        print("{:.<060} {:< 010.4e}".format(name, ranked[rank]))
        top_features.append(name)
    return top_features

In [None]:
# The 15 most important features of the random forest.
top_features = print_top_influencers_tree(test_df, all_features, regr, n=15)

### CatBoost

In [None]:
! pip install catboost
from catboost import Pool, CatBoostRegressor

In [None]:
# Reuse the random-forest train/validation samples; the validation pool is the eval set.
train_dataset = Pool(data=X, label=y)
eval_dataset = Pool(data=X_validate, label=y_validate)
model = CatBoostRegressor(iterations=1400, use_best_model=True, learning_rate=0.02, max_depth=6, loss_function='RMSE')

# use_best_model is passed both to the constructor and to fit.
model.fit(train_dataset,
          use_best_model=True,
          eval_set=eval_dataset)

In [None]:
# Model score on the train and validation samples (log scale).
print(f"R^2: {model.score(X, y)} validate: {model.score(X_validate, y_validate)}")

In [None]:
# Predicted (x axis) against actual (y axis) values on the validation sample.
plt.plot(model.predict(X_validate), y_validate, 'ro')
plt.show()

In [None]:
# Feature importances computed on the training pool.
importances = model.get_feature_importance(data=train_dataset,
                                           prettified=True,
                                           thread_count=-1,
                                           verbose=False)

In [None]:
# Display the CatBoost feature importances.
importances

## Classification

In [None]:
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, recall_score, precision_score

In [None]:
def train_validate(train, validate):
    """Split both frames into standardized features and the ``is_top`` label.

    The scaler is fitted on the training features only and applied to both parts.

    :param train: frame with the ``all_features`` columns and ``is_top``
    :param validate: frame with the same columns
    :return: ``(X, y, X_validate, y_validate)``; the feature matrices are scaled arrays
    """
    columns = all_features + ['is_top']
    train_part = train[columns]
    validate_part = validate[columns]

    # is_top is the last selected column; everything before it is a feature.
    X, y = train_part.iloc[:, :-1], train_part.iloc[:, -1]
    X_validate, y_validate = validate_part.iloc[:, :-1], validate_part.iloc[:, -1]

    scaler = StandardScaler().fit(X)
    return scaler.transform(X), y, scaler.transform(X_validate), y_validate

In [None]:
# Share of papers with the largest c5 that defines the "top" class.
part = 0.02

In [None]:
# The smallest c5 among the `part` share of most cited papers is the threshold,
# computed separately for train and validate.
train_best = train.nlargest(columns=['c5'], n=int(part * train.shape[0]))
train_min = train_best['c5'].min()
validate_best = validate.nlargest(columns=['c5'], n=int(part * validate.shape[0]))
val_min = validate_best['c5'].min()
# NOTE(review): the comparison is strict, so papers whose c5 equals the threshold belong to
# train_best/validate_best but get is_top == 0 — confirm this is intended.
train['is_top'] = train['c5'].apply(lambda x: 1 if x > train_min else 0)
validate['is_top'] = validate['c5'].apply(lambda x: 1 if x > val_min else 0)

In [None]:
# Smallest c5 among the top training papers, per recency value.
train_best.groupby(by=['recency'])['c5'].min()

In [None]:
# Minority (top) and majority (not top) parts of the training data.
top = train[train['is_top'] == 1]
not_top = train[train['is_top'] == 0]

In [None]:
# Class sizes before resampling.
print(f"top size: {top.shape[0]} not top size: {not_top.shape[0]}")

In [None]:
# Draw len(top) majority rows without replacement (fixed seed).
not_top_downsampled = resample(not_top, replace=False, n_samples=len(top), random_state=27)
# combine minority and downsampled majority
downsampled = pd.concat([not_top_downsampled, top])

In [None]:
# Draw len(not_top) minority rows with replacement (fixed seed).
top_upsampled = resample(top, replace=True, n_samples=len(not_top), random_state=27)
# combine majority and upsampled minority
upsampled = pd.concat([not_top, top_upsampled])

In [None]:
# Preview the upsampled training data.
upsampled.head()

In [None]:
# Train a random forest classifier on each balanced dataset with three weights for the
# top class and report accuracy, confusion matrix, recall and precision on validation.
for dataset in [downsampled, upsampled]:
    X, y, X_validate, y_validate = train_validate(dataset, validate)
    # Rebinds the notebook-level `target` used by earlier cells.
    target = 'is_top'
    for weight in ([1, 3, 5]):
        print("weight =", weight)
        clf = RandomForestClassifier(max_depth=7, n_estimators=1000, class_weight={0: 1, 1: weight})
        clf.fit(X, y)

        print("score train: ", clf.score(X, y), "validate :", clf.score(X_validate, y_validate))
        # Unpacking four values requires both classes to be present in the matrix.
        tn, fp, fn, tp = confusion_matrix(y_validate, clf.predict(X_validate)).ravel()
        print(tn, fp, fn, tp)
        print("recall :", recall_score(y_validate, clf.predict(X_validate)))
        print("precision :", precision_score(y_validate, clf.predict(X_validate)))
        print()