In [5]:
import pickle
from importlib import reload
import numpy as np
from scipy import sparse
from zipfile import ZipFile
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
import features
reload(features)
from features import load_features

In [7]:
import submission
reload(submission)
from submission import create_submission

In [8]:
# Load only the regression target. The CSV is read straight out of the zip
# archive, and `usecols` restricts parsing to the single column needed here so
# the remaining columns are never materialised in memory.
with ZipFile('../data/train.csv.zip') as z:
    with z.open('train.csv') as f:
        train_df = pd.read_csv(f, usecols=['deal_probability'])
# Keep an independent copy of the target so the frame itself can be released.
y = train_df['deal_probability'].copy()
del train_df

In [9]:
# Load the pickled title TF-IDF vectorizer; it is used below to read its feature names.
# NOTE: pickle.load can execute arbitrary code from the file -- only load trusted artifacts.
with open('../feature_exploration/text_features/title_tfidf_vectorizer.pkl', 'rb') as f:
    title_tfv = pickle.load(f)

In [10]:
# Load the pickled description TF-IDF vectorizer; it is used below to read its feature names.
# NOTE: pickle.load can execute arbitrary code from the file -- only load trusted artifacts.
# NOTE(review): the file name is spelled 'descrition'; presumably this matches the
# artifact on disk -- confirm before renaming anything.
with open('../feature_exploration/text_features/descrition_tfidf_vectorizer.pkl', 'rb') as f:
    description_tfv = pickle.load(f)

In [11]:
# Vocabulary terms of the title vectorizer, as a plain Python list.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use the newer method whenever it is available
# and fall back to the old one on older installations.
if hasattr(title_tfv, 'get_feature_names_out'):
    title_names = title_tfv.get_feature_names_out().tolist()
else:
    title_names = title_tfv.get_feature_names()

In [12]:
# Preview the first ten title feature names.
title_names[:10]

['128gb', '16g', '16gb', '16гб', '1gb', '2gb', '2в1', '2м', '2х', '2шт']

In [13]:
# Vocabulary terms of the description vectorizer, as a plain Python list.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use the newer method whenever it is available
# and fall back to the old one on older installations.
if hasattr(description_tfv, 'get_feature_names_out'):
    description_names = description_tfv.get_feature_names_out().tolist()
else:
    description_names = description_tfv.get_feature_names()

In [14]:
# Preview the first ten description feature names.
description_names[:10]

['1000р', '100р', '150р', '16gb', '1шт', '200р', '2х', '2шт', '300р', '3d']

In [15]:
# Drop both vectorizer objects; only the extracted name lists are kept.
del title_tfv, description_tfv

In [16]:
# Names of the feature groups passed to `load_features` for both splits.
feature_names = [
    'categorical_one_hot',
    'item_seq_number',
    'price',
    'title_length_chars',
    'description_length_chars',
    'title_tfidf',
    'description_tfidf',
]

In [17]:
# Load the training features for the groups listed in `feature_names`.
# The first returned value is discarded, and `categorical_indices` is not used
# again in the cells visible here.
# NOTE(review): presumably `train_features` is a sequence with one matrix per
# feature group -- confirm against features.load_features.
_, train_features, categorical_indices = load_features('train', feature_names)

In [18]:
# Stack the loaded feature blocks column-wise into a single sparse matrix.
# NOTE: this rebinds `train_features` from the loaded blocks to the stacked
# matrix, so the cell is not idempotent -- re-run the loading cell first.
train_features = sparse.hstack(train_features)

In [19]:
from sklearn.preprocessing import StandardScaler
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement is `sklearn.impute.SimpleImputer`, whose defaults (mean strategy,
# NaN treated as missing) match the default `Imputer()` used in this notebook.
# Binding it under the old name keeps the cells below unchanged, and the
# fallback keeps the notebook working on installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

In [20]:
# Fill missing values with the imputer's default settings, fitted on the full
# training matrix. The fitted `imputer` is reused later to transform the test features.
# NOTE(review): fitting happens before the train/validation split below, so the
# held-out rows contribute to the fitted statistics.
imputer = Imputer()
train_features = imputer.fit_transform(train_features)

In [21]:
# Scale the features with StandardScaler; with_mean=False disables centering,
# so only the scaling step is applied.
# The fitted `scaler` is reused later to transform the test features.
# NOTE(review): like the imputer, this is fitted before the train/validation
# split below, so the held-out rows contribute to the fitted statistics.
scaler = StandardScaler(with_mean=False)
train_features = scaler.fit_transform(train_features)

In [22]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split

In [23]:
# Hold out part of the training rows for validation, using train_test_split's
# default split size. A fixed random_state makes the split -- and therefore the
# validation RMSE computed from it -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(train_features, y, random_state=42)

In [24]:
# Baseline ridge regression with default hyper-parameters, fitted on the
# training split only so that X_test / y_test stay a genuine hold-out set.
# The fit call must live in this cell: without it the later predict() call on
# X_test fails on a fresh kernel because the model was never fitted.
ridge = Ridge()
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
# Root-mean-squared error of the ridge model on the held-out split.
holdout_mse = mean_squared_error(y_test, ridge.predict(X_test))
np.sqrt(holdout_mse)

0.2299234155774438

In [28]:
# Fresh, unfitted Ridge model with default settings; rebinding `ridge` discards
# whatever model the name referred to before.
ridge = Ridge()

In [30]:
# Fit on every training row (no hold-out); %time reports how long the fit takes.
%time ridge.fit(train_features, y)

Wall time: 5min 1s


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [35]:
# Load the test features for the same `feature_names` used for training; the
# first and third returned values are discarded.
# NOTE(review): presumably the blocks come back in the same column layout as
# the training ones -- confirm against features.load_features.
_, test_features, _ = load_features('test', feature_names)

In [36]:
# Stack the loaded test feature blocks column-wise into a single sparse matrix.
# NOTE: this rebinds `test_features`, so the cell is not idempotent -- re-run
# the loading cell first.
test_features = sparse.hstack(test_features)

In [39]:
# Convert the stacked test matrix to CSR format.
test_features = test_features.tocsr()

In [40]:
# Apply the imputer that was fitted on the training matrix (transform only, no refit).
test_features = imputer.transform(test_features)

In [41]:
# Apply the scaler that was fitted on the training matrix (transform only, no refit).
test_features = scaler.transform(test_features)

In [42]:
# Read test.csv directly out of its zip archive; the full frame is kept because
# it is passed on to create_submission.
with ZipFile('../data/test.csv.zip') as test_archive:
    with test_archive.open('test.csv') as test_csv:
        test_df = pd.read_csv(test_csv)

In [43]:
# Predict on the test matrix and hand the predictions to create_submission
# together with the output directory, a name, and test_df.
# NOTE(review): ridge predictions are unbounded; if the target must lie in
# [0, 1], confirm that create_submission clips the values.
create_submission(ridge.predict(test_features), '../submissions/', 'ridge_test', test_df)

In [51]:
# In-sample predictions: `ridge` was fitted on these same training rows.
# NOTE(review): if these are fed to a downstream model as a feature,
# out-of-fold predictions would avoid leaking the target -- confirm intent.
train_predictions = ridge.predict(train_features)

In [52]:
# Predictions of the fitted ridge model for the test matrix.
test_predictions = ridge.predict(test_features)

In [53]:
from scipy.sparse import save_npz, csr_matrix

In [55]:
# Persist the ridge predictions for both splits, each as a single-column sparse
# matrix in .npz format.
prediction_outputs = (
    ('../feature_exploration/num_features/train/ridge_predictions.npz', train_predictions),
    ('../feature_exploration/num_features/test/ridge_predictions.npz', test_predictions),
)
for npz_path, split_predictions in prediction_outputs:
    with open(npz_path, 'wb') as f:
        # reshape(-1, 1) turns the flat prediction vector into one column.
        save_npz(f, csr_matrix(split_predictions.reshape(-1, 1)))