# Задание 11

Дана информация о платформе стартапов. Научитесь предсказывать количество голосов (столбец `votesCount`).

Метрика: $20 \cdot R^2$, где $R^2$ — коэффициент детерминации.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

import os
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

# File names of the competition artefacts.
train_name, test_name = 'producthunt_train.csv', 'producthunt_test.csv'
sample_name, submission_name = 'sample_submission.csv', 'submission.csv'

# Hide GPU from TensorFlow
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Directory holding the task data. The empty last component makes the path end
# with a separator, so file names can be appended to it directly.
data_path = os.path.join('/tf', 'shared_data', 'profi-23', '11', '')
train_path = f'{data_path}{train_name}'
test_path = f'{data_path}{test_name}'
sample_path = f'{data_path}{sample_name}'
submission_path = f'{data_path}{submission_name}'

In [2]:
# Read both splits the same way, using the `id` column as the row index.
train_csv, test_csv = (
    pd.read_csv(csv_path, index_col='id') for csv_path in (train_path, test_path)
)
train_csv

Unnamed: 0_level_0,slug,name,tagline,commentsCount,dateAdded,timeAdded,topics,votesCount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,superannotate-desktop,SuperAnnotate Desktop,Super easy and fast ML annotations on your PC,93,2020-10-29,07:18:00,"Windows, Mac, SaaS, Developer Tools, Artificia...",660
1,pricing-page,Pricing Page,No-code pricing pages with Stripe checkout,4,2021-06-20,15:31:23,"SaaS, E-Commerce",140
2,infinitemail,Infinitemail,The better way to get email marketing done for...,19,2023-06-05,07:23:32,"Design Tools, Email Marketing, Marketing",77
3,quarter-super-charge-powerbank,Quarter Super Charge Powerbank,The first powerbank using the MagSafe for supe...,10,2015-10-23,07:14:22,Tech,126
4,stickymappers,Stickymappers,The sexiest green stickers on the iMessage store.,3,2016-10-05,07:03:00,"Messaging, Tech",122
...,...,...,...,...,...,...,...,...
56697,reason8-ai-2-0,Reason8.AI 2.0,Turn conversations 💬 into summaries 📝,40,2018-05-09,07:01:00,"Android, iOS, Productivity, Meetings, Artifici...",507
56698,project-pulse,Project Pulse,Project status pages communicate progress for you,10,2015-03-09,15:08:03,Tech,81
56699,ux-toolkit-for-figma,UX Toolkit for Figma,"Create beautiful user flow charts, sitemaps an...",27,2020-07-23,07:33:21,"Design Tools, User Experience",548
56700,studiolight-co,studiolight.co,Airbnb for photo studios,4,2015-05-02,13:35:26,Tech,127


In [3]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Convert a raw frame into the columns used for modelling.

    * ``dateAdded`` and ``timeAdded`` are joined and parsed into a single
      datetime column ``time`` (appended as the last column).
    * ``dateAdded``, ``timeAdded``, ``slug``, ``name`` and ``tagline`` are
      dropped.
    * ``topics`` is split on ``', '`` into a list of topic names; any value that
      does not split into a list (e.g. a missing value) becomes ``[]``.

    The input frame is not modified; a new frame is returned.

    Args:
        df: Frame containing at least ``dateAdded``, ``timeAdded``, ``slug``,
            ``name``, ``tagline`` and ``topics``.

    Returns:
        A new frame with the remaining original columns (in their original
        order, ``topics`` now holding lists) followed by ``time``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    timestamps = pd.to_datetime(df['dateAdded'] + ' ' + df['timeAdded'])

    # `str.split` leaves missing entries as non-list values; normalise them to
    # empty lists so every row holds a list.
    topic_lists = df['topics'].str.split(', ').apply(
        lambda value: value if isinstance(value, list) else []
    )

    # `assign` works on a copy, so the caller's frame never gains a `time`
    # column as a side effect.
    return (
        df.assign(time=timestamps)
        .drop(columns=['dateAdded', 'timeAdded', 'slug', 'name', 'tagline'])
        .assign(topics=topic_lists)
    )

In [4]:
# Replace each raw split with its preprocessed version.
train_csv = preprocess(train_csv)
test_csv = preprocess(test_csv)

# Pool topics and timestamps from both splits (train rows first, then test).
all_topics = pd.concat([train_csv['topics'], test_csv['topics']])
all_times = pd.concat([train_csv['time'], test_csv['time']])
all_topics

id
0        [Windows, Mac, SaaS, Developer Tools, Artifici...
1                                       [SaaS, E-Commerce]
2               [Design Tools, Email Marketing, Marketing]
3                                                   [Tech]
4                                        [Messaging, Tech]
                               ...                        
69405           [iOS, iPad, Design Tools, Tech, Wallpaper]
69406                              [Email Marketing, Tech]
69407                         [Web App, Prototyping, Tech]
69408            [Android, iOS, Web App, Health & Fitness]
69409                            [iOS, Productivity, Tech]
Name: topics, Length: 69410, dtype: object

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

# Fit the topic encoder and the time scaler on the pooled train + test values
# so both splits are mapped into the same feature space.
encoder = MultiLabelBinarizer()
encoder.fit(all_topics)

scaler = MinMaxScaler()
scaler.fit(all_times.to_numpy().reshape(-1, 1))


def _build_features(frame):
    """Assemble the feature matrix for one split.

    Column layout: the remaining columns of ``frame`` (everything except
    ``votesCount``, ``topics`` and ``time``), then the min-max scaled ``time``,
    then one indicator column per topic known to ``encoder``.

    The frame itself is not modified: the scaled time lives only in the
    returned array, so this cell can be re-run without rescaling values that
    were already scaled.
    """
    scaled_time = scaler.transform(frame['time'].to_numpy().reshape(-1, 1))
    # `votesCount` exists only in the training split, hence errors='ignore'.
    other_columns = frame.drop(
        columns=['votesCount', 'topics', 'time'], errors='ignore'
    ).to_numpy()
    topic_flags = encoder.transform(frame['topics'])
    return np.concatenate((other_columns, scaled_time, topic_flags), axis=1)


train_X = _build_features(train_csv)
test_X = _build_features(test_csv)
train_y = train_csv['votesCount']

In [6]:
import lightgbm as lgbm

# LightGBM regressor with 100 boosting rounds, fitted on the full training matrix.
model = lgbm.LGBMRegressor(n_estimators=100)
model.fit(X=train_X, y=train_y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1040
[LightGBM] [Info] Number of data points in the train set: 56702, number of used features: 267
[LightGBM] [Info] Start training from score 287.430902


In [7]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# R^2 on the very rows the model was fitted on. This is an optimistic figure
# and says little about performance on unseen data.
score = r2_score(train_y, model.predict(train_X))
print(f"R^2-коэффициент для LightGBM: {score}")

# Held-out estimate: 5-fold cross-validation. `cross_val_score` fits clones of
# the estimator, so the already fitted `model` is left untouched.
cv_scores = cross_val_score(model, train_X, train_y, scoring='r2', cv=5)
print(
    "R^2-коэффициент для LightGBM (кросс-валидация, 5 фолдов): "
    f"{cv_scores.mean():.4f} ± {cv_scores.std():.4f}"
)

R^2-коэффициент для LightGBM: 0.6685018307413679


In [8]:
# Build the submission as its own frame instead of overwriting `test_csv`, so
# the feature columns stay available and earlier cells can be re-run afterwards.
submission = pd.DataFrame(
    {'votesCount': model.predict(test_X)},
    index=test_csv.index,
)

# NOTE(review): index=False means the `id` index is not written to the file;
# confirm this matches the layout expected by the sample submission.
submission.to_csv(submission_path, index=False)