In [62]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import json
from sklearn import svm

In [63]:
# Read train data
# One column per topic time series (plus a "date" column that feature_ext skips)
train_df = pd.read_csv("../data/gtrends.csv")
# Labels table, indexed by its first column (later matched against "topic")
train_labels_df = pd.read_csv("../data/labels.csv", index_col=0)
# Mapping from a class label (as a string key) to its human-readable meaning.
# Opened through a context manager so the file handle is always closed.
with open("../data/class_meaning.json", "r") as class_meaning_file:
    class_meaning = json.load(class_meaning_file)

# Feature extraction
Computing the features of each time series :

- Mean
- Maximum z-score :
    
    $ z = \frac{x - \mu}{\sigma},  $ where:
    - $ x $ is the maximum value of the time series
    - $ \mu $ is the mean of the time series
    - $ \sigma $ is the standard deviation of the time series
- Autocorrelation coefficient (Pearson's) :

    Computed with the `Series.autocorr` method of the pandas library,
    which returns Pearson's correlation coefficient between the time series and a lagged copy of itself (the code below uses a lag of 53 samples, intended as one year).

In [64]:
def feature_ext(dataframe, lag=53):
    """Compute one row of summary features for every time-series column.

    Every column of ``dataframe`` except ``"date"`` is treated as a time
    series. Its values are divided by 100 before the features are computed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One column per topic, optionally with a ``"date"`` column (ignored).
    lag : int, default 53
        Lag, in samples, used for the autocorrelation feature (the default is
        the notebook's "1 year lag").

    Returns
    -------
    pandas.DataFrame
        One row per topic with the columns ``mean``, ``max_zscore``,
        ``autocorr`` and ``topic``. The frame is empty (no columns) when
        ``dataframe`` has no topic columns.
    """
    records = []
    for col in dataframe.columns:
        if col == "date":
            continue
        data_col = dataframe[col] / 100  # normalize data
        col_mean = data_col.mean()
        records.append(
            {
                "mean": col_mean,
                # Series.max() skips NaN exactly like mean()/std() do; the
                # builtin max() gives order-dependent results when NaN is present.
                "max_zscore": (data_col.max() - col_mean) / data_col.std(),
                "autocorr": data_col.autocorr(lag=lag),
                "topic": col,
            }
        )

    # Build the frame once instead of concatenating inside the loop (quadratic).
    return pd.DataFrame(records)

In [65]:
# Extract one feature row (mean, max_zscore, autocorr, topic) per training series
features = feature_ext(train_df)

In [66]:
def plot_data(features, labels_df=""):
    """Show a 3D scatter plot of the extracted features.

    The axes are ``autocorr`` (x), ``max_zscore`` (y) and ``mean`` (z); each
    point is annotated with its ``topic``.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame with the columns ``max_zscore``, ``autocorr``, ``mean`` and
        ``topic``.
    labels_df : str or array-like, default ""
        Passed to Plotly Express as ``color``: a column name of ``features``
        or a sequence of labels. An empty string or ``None`` draws the points
        without class coloring.
    """
    scatter_kwargs = {
        "y": "max_zscore",
        "x": "autocorr",
        "z": "mean",
        "text": "topic",
    }
    # Compare against "" only for real strings: `series == ""` is element-wise
    # and cannot be used as an `if` condition.
    no_labels = labels_df is None or (
        isinstance(labels_df, str) and labels_df == ""
    )
    if not no_labels:
        scatter_kwargs["color"] = labels_df

    fig = px.scatter_3d(features, **scatter_kwargs)
    fig.show()

In [67]:
# Plot the features without class coloring (an empty string means "no labels")
plot_data(features, "")

In [68]:
def drop_rows_with_nan(df):
    """Return ``df`` without the rows that hold at least one NaN value.

    The model can't handle NaN values, so incomplete rows are discarded.
    The input frame itself is left untouched.
    """
    return df.dropna(how="any", axis="index")

In [69]:
# Discard the topics for which at least one feature could not be computed (NaN)
features = drop_rows_with_nan(features)

In [70]:

# Appending labels to the features extracted from the train data:
# each row's "topic" value is matched against the index of train_labels_df
features = features.merge(train_labels_df, left_on="topic", right_index=True)

In [71]:
def remove_minus_one_label(features):
    """Separate the rows labelled -1 from the rest of ``features``.

    Rows whose ``class`` is -1 are not used to train the model.

    Returns a tuple ``(kept, minus)``: ``kept`` holds every row whose
    ``class`` differs from -1, ``minus`` holds the rows whose ``class`` is -1.
    """
    labels = features["class"]
    kept = features.loc[labels != -1]
    minus = features.loc[labels == -1]
    return kept, minus

In [72]:
# Keep only the rows usable for training; the rows labelled -1 go to `minus`
features, minus = remove_minus_one_label(features)

In [73]:
# Plot the training features again, this time colored by their "class" column
plot_data(features, "class")

# Model training

In [74]:
def create_svm_model(features):
    """Create and train an SVM classifier with a linear kernel.

    The model is fitted on the ``max_zscore``, ``autocorr`` and ``mean``
    columns of ``features`` (in that order), using the ``class`` column as
    the target. The fitted intercepts and hyperplane coefficients are
    printed, and the fitted classifier is returned.
    """
    predictors = features[["max_zscore", "autocorr", "mean"]]
    target = features["class"]

    classifier = svm.SVC(kernel="linear")
    classifier.fit(predictors, target)

    print("Intercept: ", classifier.intercept_)
    print("Coefficients of separating hyperplanes: ", classifier.coef_)
    return classifier

In [75]:
# Train the linear SVM on the labeled features (prints intercepts and coefficients)
svm_model = create_svm_model(features)

Intercept:  [-0.58514926 -0.41882322  6.22055968  0.73283403  7.43921992  6.29679827]
Coefficients of separating hyperplanes:  [[-0.22967964  0.97472781  2.77785143]
 [ 0.07720841 -2.238815    2.1740481 ]
 [-1.09218307 -0.05987852  0.24852723]
 [ 0.15628067 -2.34686884 -1.20051904]
 [-1.20910459 -0.28440432 -0.46023121]
 [-1.10849616  0.54936155 -0.12274472]]


# Plotting the separation plane of the SVM
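The SVM model of scikit-learn exposes the coefficients needed to write the equation of each separating hyperplane.

With three features, a separating hyperplane is the set of points where
$ w_0 + w_1x + w_2y + w_3z = 0 $, so the surface to plot is
$ z = f(x,y) = -\frac{w_0 + w_1x + w_2y}{w_3} $.

Where $ w_0 $ is the intercept_ and $ w_1 $, $ w_2 $ and $ w_3 $ are the coef_ of max_zscore ($ x $), autocorr ($ y $) and mean ($ z $).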

In [76]:
def plot_svm_nth_separation_plane(features, svm_model, plane_number):
    """Plot the features together with one separating plane of the SVM.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame with the columns ``max_zscore`` (x), ``autocorr`` (y),
        ``mean`` (z), ``topic`` and ``class``.
    svm_model : fitted linear model
        Object exposing ``intercept_`` and ``coef_``; ``coef_[plane_number]``
        must hold the weights of x, y and z in that order.
    plane_number : int
        Index of the separating plane to draw.

    Raises
    ------
    ValueError
        If the selected plane has a zero coefficient on the z axis, so it
        cannot be written as ``z = f(x, y)``.
    """
    # plot features
    fig = px.scatter_3d(
        features,
        x="max_zscore",
        y="autocorr",
        z="mean",
        text="topic",
        color="class",
    )

    # plot nth separation plane: w0 + w1*x + w2*y + w3*z = 0, solved for z
    intercept = svm_model.intercept_[plane_number]
    weights = svm_model.coef_[plane_number]
    w_x, w_y, w_z = weights[0], weights[1], weights[2]
    if w_z == 0:
        raise ValueError(
            f"Separation plane {plane_number} is parallel to the z axis "
            "and cannot be drawn as a surface z = f(x, y)"
        )

    # linspace includes both extremes, so the plane spans the whole data range
    # even when that range is narrower than a fixed step would allow.
    grid_points = 20
    x_axis = np.linspace(
        features["max_zscore"].min(), features["max_zscore"].max(), grid_points
    )
    y_axis = np.linspace(
        features["autocorr"].min(), features["autocorr"].max(), grid_points
    )
    x, y = np.meshgrid(x_axis, y_axis)
    z = -(intercept + w_x * x + w_y * y) / w_z

    fig.add_trace(
        go.Surface(
            x=x,
            y=y,
            z=z,
            showscale=False,
            opacity=0.5,
            colorscale="Blues",
            name="SVM separation plane",
        )
    )
    fig.show()

In [77]:
# Draw the first separating plane; the fitted model printed above has six of them
plot_svm_nth_separation_plane(features, svm_model, 0) # 0 to 5 planes

# Predicting the classes with the SVM

In [78]:
def predic_with_svm_model(clf, features):
    """Predict a class for every row and store it in ``features``.

    ``clf.predict`` is called on the ``max_zscore``, ``autocorr`` and ``mean``
    columns (in that order). The predictions are written into a new
    ``svm_label`` column of the frame that was passed in, and that same frame
    is returned.
    """
    model_inputs = ["max_zscore", "autocorr", "mean"]
    predicted_labels = clf.predict(features[model_inputs])
    # append the class prediction to the features dataframe
    features["svm_label"] = predicted_labels
    return features

# Testing the model

Here we download the data of some topics and use the model to predict the classes of the time series.

In [79]:
import google_trends.dl_google_trends as gt
from datetime import datetime, timedelta

# defining time range: from 4 * 365 days ago until now, as datetime strings
start_date = str(datetime.today() - timedelta(days=365 * 4))
end_date = str(datetime.today())
# topics to download
topic = ["sciarpa","1 gennaio 2022","rosso"]

# downloading data from google trends
# takes at least 1 min to download the data
# NOTE(review): the last two arguments ("" and "it-IT") look like a category
# filter and a locale; confirm against google_trends.dl_google_trends
gtrends_series = gt.get_google_trends(
    topic, start_date, end_date, "", "it-IT"
)

# save data to csv (read back by the results cell)
gtrends_series.to_csv("../data/gtrends_downloaded.csv")

Downloading Google Trends...


100%|██████████| 3/3 [03:03<00:00, 61.16s/it]


# Showing the results

In [80]:
# Reload the downloaded series from disk
gtrends_series = pd.read_csv("../data/gtrends_downloaded.csv")

# Same pipeline as for the training data: extract features, drop incomplete rows
gtrends_series_features = feature_ext(gtrends_series)

gtrends_series_features = drop_rows_with_nan(gtrends_series_features)

if len(gtrends_series_features) > 0:
    gtrends_series_features = predic_with_svm_model(svm_model, gtrends_series_features)

    # print predicted class
    # Iterate over the values rather than indexing by position: dropping NaN
    # rows leaves gaps in the index, so label lookups like series[i] can fail.
    for topic_name, predicted_class in zip(
        gtrends_series_features["topic"], gtrends_series_features["svm_label"]
    ):
        print(
            f"Topic : {topic_name} - Predicted class : {predicted_class} - {class_meaning[str(predicted_class)]}"
        )

    # Plot the raw downloaded series on the 0-100 axis
    fig = px.line(gtrends_series, x="date", y=topic)
    fig.update_layout(yaxis_range=[0, 100])
    fig.show()

Topic : sciarpa - Predicted class : 2 - Hot topic recurrently
Topic : 1 gennaio 2022 - Predicted class : 3 - Hot topic for a single time
Topic : rosso - Predicted class : 0 - Hot topic
