In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
print(train_data.describe())

         PRODUCT_ID  PRODUCT_TYPE_ID  PRODUCT_LENGTH
count  2.249698e+06     2.249698e+06    2.249698e+06
mean   1.499795e+06     4.000456e+03    4.071839e+03
std    8.661944e+05     3.966146e+03    1.351685e+06
min    1.000000e+00     0.000000e+00    1.000000e+00
25%    7.494795e+05     2.300000e+02    5.118110e+02
50%    1.499558e+06     2.916000e+03    6.630000e+02
75%    2.250664e+06     6.403000e+03    1.062992e+03
max    2.999999e+06     1.342000e+04    1.885801e+09


In [11]:
train_data.corr()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
PRODUCT_ID,1.0,-0.017143,-0.466076,-0.446342,0.250576,0.000424
TITLE,-0.017143,1.0,0.035067,0.081965,-0.030884,0.001279
BULLET_POINTS,-0.466076,0.035067,1.0,0.341391,-0.24236,-0.00149
DESCRIPTION,-0.446342,0.081965,0.341391,1.0,-0.195562,-0.000996
PRODUCT_TYPE_ID,0.250576,-0.030884,-0.24236,-0.195562,1.0,0.000961
PRODUCT_LENGTH,0.000424,0.001279,-0.00149,-0.000996,0.000961,1.0


In [12]:
train_data.corr(method='spearman')

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
PRODUCT_ID,1.0,-0.017125,-0.562488,-0.530011,0.316873,0.095589
TITLE,-0.017125,1.0,0.034146,0.052201,-0.02341,-0.009761
BULLET_POINTS,-0.562488,0.034146,1.0,0.49451,-0.380302,-0.127709
DESCRIPTION,-0.530011,0.052201,0.49451,1.0,-0.298203,-0.03347
PRODUCT_TYPE_ID,0.316873,-0.02341,-0.380302,-0.298203,1.0,0.076806
PRODUCT_LENGTH,0.095589,-0.009761,-0.127709,-0.03347,0.076806,1.0


In [6]:
mode_value = train_data['PRODUCT_LENGTH'].mode()
print(mode_value)

0    600.0
Name: PRODUCT_LENGTH, dtype: float64


In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_data['DESCRIPTION'] = le.fit_transform(train_data['DESCRIPTION'])

# Convert the 'Feature' column from integer to float
train_data['DESCRIPTION']  =train_data['DESCRIPTION'] .astype(float)

In [8]:

train_data['BULLET_POINTS'] = le.fit_transform(train_data['BULLET_POINTS'])

# Convert the 'Feature' column from integer to float
train_data['BULLET_POINTS']  =train_data['BULLET_POINTS'] .astype(float)

In [9]:

train_data['TITLE'] = le.fit_transform(train_data['TITLE'])

# Convert the 'Feature' column from integer to float
train_data['TITLE']  =train_data['TITLE'] .astype(float)

In [10]:
train_data.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,209798.0,450338.0,745275.0,1650,2125.98
1,2673191,1200569.0,392317.0,745275.0,2755,393.7
2,2765088,1408667.0,466533.0,572211.0,7537,748.031495
3,1594019,87193.0,479203.0,204578.0,2996,787.401574
4,283658,1915247.0,965330.0,745275.0,6112,598.424


In [13]:
train_data.isnull().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
PRODUCT_LENGTH     0
dtype: int64

In [14]:
train_data.dtypes

PRODUCT_ID           int64
TITLE              float64
BULLET_POINTS      float64
DESCRIPTION        float64
PRODUCT_TYPE_ID      int64
PRODUCT_LENGTH     float64
dtype: object

In [15]:
test_data['DESCRIPTION'] = le.fit_transform(test_data['DESCRIPTION'])

# Convert the 'Feature' column from integer to float
test_data['DESCRIPTION']  =test_data['DESCRIPTION'] .astype(float)

In [16]:
test_data['BULLET_POINTS'] = le.fit_transform(test_data['BULLET_POINTS'])

# Convert the 'Feature' column from integer to float
test_data['BULLET_POINTS']  =test_data['BULLET_POINTS'] .astype(float)

In [17]:
test_data['TITLE'] = le.fit_transform(test_data['TITLE'])

# Convert the 'Feature' column from integer to float
test_data['TITLE']  =test_data['TITLE'] .astype(float)

In [18]:
test_data.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,393756.0,355169.0,274848.0,6142
1,1729783,163606.0,243872.0,18130.0,1622
2,1871949,302369.0,134563.0,195809.0,7540
3,1107571,479331.0,52053.0,178367.0,12442
4,624253,674400.0,355169.0,274848.0,6318


In [19]:
test_data.isnull().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
dtype: int64

In [20]:
test_data.dtypes

PRODUCT_ID           int64
TITLE              float64
BULLET_POINTS      float64
DESCRIPTION        float64
PRODUCT_TYPE_ID      int64
dtype: object

In [21]:
test_data['PRODUCT_LENGTH'] = 600

In [22]:
X_train = train_data.drop(['TITLE'], axis = 1).values
X_test = test_data.drop(['TITLE'], axis = 1).values
y_train = train_data['PRODUCT_LENGTH'].values

In [23]:
X_train

array([[1.92520200e+06, 4.50338000e+05, 7.45275000e+05, 1.65000000e+03,
        2.12598000e+03],
       [2.67319100e+06, 3.92317000e+05, 7.45275000e+05, 2.75500000e+03,
        3.93700000e+02],
       [2.76508800e+06, 4.66533000e+05, 5.72211000e+05, 7.53700000e+03,
        7.48031495e+02],
       ...,
       [1.98778600e+06, 2.79205000e+05, 6.94597000e+05, 1.57400000e+03,
        1.20000000e+03],
       [1.16575400e+06, 9.65330000e+05, 7.45275000e+05, 5.92000000e+02,
        2.90000000e+03],
       [1.07266600e+06, 2.44620000e+05, 5.64190000e+05, 7.36700000e+03,
        2.00000000e+03]])

In [24]:
X_test

array([[6.043730e+05, 3.551690e+05, 2.748480e+05, 6.142000e+03,
        6.000000e+02],
       [1.729783e+06, 2.438720e+05, 1.813000e+04, 1.622000e+03,
        6.000000e+02],
       [1.871949e+06, 1.345630e+05, 1.958090e+05, 7.540000e+03,
        6.000000e+02],
       ...,
       [8.415290e+05, 3.588100e+04, 7.729000e+03, 1.064500e+04,
        6.000000e+02],
       [1.190194e+06, 1.563020e+05, 1.224480e+05, 1.268000e+04,
        6.000000e+02],
       [1.040810e+06, 1.151850e+05, 1.786900e+05, 0.000000e+00,
        6.000000e+02]])

In [25]:
y_train

array([2125.98     ,  393.7      ,  748.0314953, ..., 1200.       ,
       2900.       , 2000.       ])

In [26]:
model = XGBRegressor(verbosity = 0)
model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [27]:
y_pred = model.predict(X_test)

In [28]:
y_pred

array([591.6246, 591.6246, 591.6246, ..., 591.6246, 591.6246, 591.6246],
      dtype=float32)

In [29]:
submission = pd.DataFrame({'PRODUCT_ID': test_data['PRODUCT_ID'], 'PRODUCT_LENGTH': y_pred})
submission.to_csv('Heracross_submission.csv', index=False)