In [1]:
import pandas as pd
import numpy as np
import spacy

In [2]:
import time
from ipywidgets import IntProgress
from IPython.display import display

In [3]:
# Load spaCy's small English pipeline once; every `nlp(text).vector` call in
# the cells below goes through this object.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [4]:
# Read the labelled reviews and the reviews that still need a rating.
# `comment` and `game` are requested as str so that numeric-looking text is
# not parsed as a number.
TEXT_DTYPES = {'comment': str, 'game': str}
review = pd.read_csv('review.csv', dtype=TEXT_DTYPES)
review_sub = pd.read_csv('review_submission.csv', dtype=TEXT_DTYPES)
# Replace missing comments with an empty string. The result is assigned back
# instead of calling `fillna(..., inplace=True)` on `review['comment']`: that
# chained in-place call acts on an intermediate Series, emits a warning on
# recent pandas and leaves `review` untouched under Copy-on-Write.
review['comment'] = review['comment'].fillna('')

In [5]:
# Game descriptions: `games` is the frame whose "score" column is used as the
# regression target further down, `games_sub` is the frame that receives
# predicted scores. Both are read with `desc` requested as str.
games, games_sub = (
    pd.read_csv(csv_path, dtype={"desc": str})
    for csv_path in ('games.csv', 'games_submission.csv')
)

### Предсказываем оценку по описанию игры: обучение на games

In [6]:
# Coerce the description column to str before it is handed to `nlp`.
games = games.assign(desc=games["desc"].astype(str))

In [7]:
# Embed every game description with spaCy, one document at a time, and move
# the progress bar forward after each one.
f = IntProgress(min=0, max=len(games))
display(f)

games_vec = []
for done, desc in enumerate(games["desc"], start=1):
    games_vec.append(nlp(desc).vector)
    f.value = done

IntProgress(value=0, max=10618)

In [8]:
# Number of distinct games referenced by the submission reviews
# (a missing value, if any, counts as one distinct entry).
review_sub["game"].nunique(dropna=False)

21334

In [9]:
# Number of rows in the submission games frame.
games_sub.shape[0]

10556

In [10]:
# Number of distinct games referenced by the labelled reviews
# (a missing value, if any, counts as one distinct entry).
review["game"].nunique(dropna=False)

21323

In [11]:
# Number of rows in the labelled games frame.
games.shape[0]

10618

In [12]:
# Turn the list of description vectors into a frame (one column per vector
# component) and append those columns to the right of `games`.
# Note: `games_vec` is rebound from a list to a DataFrame here.
games_vec = pd.DataFrame(data=games_vec)
games = pd.concat(objs=[games, games_vec], axis="columns")

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

In [14]:
# 5-fold cross-validation of a plain linear regression that predicts "score"
# from every column after the first three (the description-vector columns
# appended above). The metric is negated MAPE, so values closer to 0 are better.
lm_games = LinearRegression()
scores = cross_val_score(
    estimator=lm_games,
    X=games.iloc[:, 3:],
    y=games["score"],
    cv=5,
    scoring='neg_mean_absolute_percentage_error',
)

In [15]:
# Per-fold negated MAPE of the description -> score model (closer to 0 is better).
scores

array([-0.11060308, -0.11192853, -0.10908101, -0.11419794, -0.11081282])

In [16]:
# Fit the description -> score model on all of `games`, using the same
# feature columns (everything after the first three) as the cross-validation.
lm_games.fit(X=games.iloc[:, 3:], y=games["score"])

LinearRegression()

### Предсказываем оценку по описанию игры для games_sub

In [17]:
# Embed every description of the submission games with spaCy.
# The progress bar is sized by `games_sub`, the frame that is actually
# iterated, so it reaches 100%; sizing it by `games` left it short because the
# two frames have different lengths.
f = IntProgress(min=0, max=len(games_sub))
display(f)

games_sub_vec = []
for desc in games_sub["desc"]:
    # str() mirrors the astype(str) applied to games["desc"]; it is a no-op for
    # values that are already strings and keeps a missing description from
    # raising inside `nlp`.
    games_sub_vec.append(nlp(str(desc)).vector)
    f.value += 1

IntProgress(value=0, max=10618)

In [18]:
# Turn the list of description vectors into a frame (one column per vector
# component) and append those columns to the right of `games_sub`.
# Note: `games_sub_vec` is rebound from a list to a DataFrame here.
games_sub_vec = pd.DataFrame(data=games_sub_vec)
games_sub = pd.concat(objs=[games_sub, games_sub_vec], axis="columns")

In [19]:
# Predict a score for every submission game from its description vector.
# The feature columns are selected by label, using the columns of the
# `games_sub_vec` frame that was concatenated onto `games_sub`, instead of by
# position (`columns[1:]`). Positional slicing would start to include the
# freshly added "score" column if this cell were run a second time.
games_sub["score"] = lm_games.predict(games_sub[games_sub_vec.columns])

### Векторизуем review 

In [20]:
# Embed every labelled review comment with spaCy, one document at a time,
# moving the progress bar forward after each one.
# NOTE(review): the isna() summary a few cells below reports ~0.17% of rows
# with missing vector components - presumably the empty comments; confirm.
f = IntProgress(min=0, max=len(review))
display(f)

review_vec = []
for done, comment in enumerate(review["comment"], start=1):
    review_vec.append(nlp(comment).vector)
    f.value = done

IntProgress(value=0, max=1684309)

In [21]:
# Sanity check: number of comment vectors collected by the loop above.
len(review_vec)

1684309

In [22]:
# Turn the list of comment vectors into a frame (one column per vector
# component) and append those columns to the right of `review`.
# Note: `review_vec` is rebound from a list to a DataFrame here.
review_vec = pd.DataFrame(data=review_vec)
review = pd.concat(objs=[review, review_vec], axis="columns")

In [23]:
# Share of missing values in each column of `review` (0.0 = none missing).
review.isna().mean(axis=0)

comment    0.000000
game       0.000000
rating     0.000000
0          0.001727
1          0.001727
             ...   
91         0.001727
92         0.001727
93         0.001727
94         0.001727
95         0.001727
Length: 99, dtype: float64

### Векторизуем review_sub

In [24]:
# Coerce the submission comments to str before they are handed to `nlp`.
# NOTE(review): the labelled frame fills missing comments with '' instead of
# casting - confirm the two treatments are meant to differ.
review_sub = review_sub.assign(comment=review_sub["comment"].astype(str))

In [25]:
# Embed every submission review comment with spaCy, one document at a time,
# moving the progress bar forward after each one.
f = IntProgress(min=0, max=len(review_sub))
display(f)

review_sub_vec = []
for done, comment in enumerate(review_sub["comment"], start=1):
    review_sub_vec.append(nlp(comment).vector)
    f.value = done

IntProgress(value=0, max=1684310)

In [26]:
# Turn the list of comment vectors into a frame, one column per vector
# component. Note: `review_sub_vec` is rebound from a list to a DataFrame here.
review_sub_vec = pd.DataFrame(data=review_sub_vec)

In [27]:
# Append the comment-vector columns to the right of `review_sub`.
review_sub = pd.concat(objs=[review_sub, review_sub_vec], axis="columns")

### Оценим модель для пользовательской оценки игры

In [28]:
# 5-fold cross-validation of a linear regression that maps a comment vector
# (every column after the first three) to the user's rating; the metric is
# negated MAPE.
# Rows are filtered once, on the feature columns plus the target, and both X
# and y are taken from that single filtered frame. Filtering X and y with two
# separate dropna() calls over different column sets only lines up while the
# remaining columns happen to contain no missing values.
lm_review = LinearRegression()
feature_cols = review.columns[3:]
review_train = review.dropna(subset=[*feature_cols, "rating"])
scores_rew = cross_val_score(lm_review, review_train[feature_cols], review_train["rating"], cv=5,
                             scoring='neg_mean_absolute_percentage_error')

In [29]:
# Per-fold negated MAPE of the comment -> rating model (closer to 0 is better).
scores_rew

array([-0.28101728, -0.27996849, -0.28106469, -0.50192912, -0.2807063 ])

In [30]:
# Fit the comment -> rating model on every labelled review that has a complete
# feature vector and a rating. X and y are taken from one filtered frame so
# they always have the same rows; two separate dropna() calls over different
# column sets would only agree while the other columns have no missing values.
feature_cols = review.columns[3:]
review_train = review.dropna(subset=[*feature_cols, "rating"])
lm_review.fit(review_train[feature_cols], review_train["rating"])

LinearRegression()

In [31]:
# Predict a rating for every submission review from its comment vector.
# The feature columns are selected by label, using the columns of the
# `review_sub_vec` frame that was concatenated onto `review_sub`, instead of
# by position (`columns[2:]`). Positional slicing would pick up the "rating"
# column itself if this cell were run again after the later cells moved it.
review_sub["rating"] = lm_review.predict(review_sub[review_sub_vec.columns])

In [32]:
# Preview of the predicted ratings for the submission reviews.
review_sub["rating"]

0          7.117733
1          7.546399
2          6.655129
3          7.268070
4          6.631054
             ...   
1684305    6.597529
1684306    6.707068
1684307    6.723706
1684308    6.729539
1684309    7.039034
Name: rating, Length: 1684310, dtype: float64

In [33]:
# Set the predicted ratings aside and remove the column from `review_sub`;
# the next cell re-inserts it at a fixed position.
rating = review_sub["rating"]
review_sub = review_sub.drop(columns=["rating"])

In [34]:
# Put "rating" back as the third column, the position it has in `review`
# (comment, game, rating, then the vector columns).
review_sub.insert(loc=2, column='rating', value=rating)

In [35]:
# Stack the labelled reviews on top of the submission reviews (row-wise).
# Each frame keeps its own index labels, so labels repeat in the result.
frames_to_stack = [review, review_sub]
review_end = pd.concat(frames_to_stack, axis=0)

#### Сохраняю все преобразованные датасеты

In [36]:
# Persist the transformed frames as CSV files (index included), in this order:
# combined reviews, submission games, labelled games.
for frame, csv_name in (
    (review_end, "review_end.csv"),
    (games_sub, "games_sub_num.csv"),
    (games, "games_num.csv"),
):
    frame.to_csv(csv_name)