# Airbnb price prediction modeling competition hosted by SIGNATE

* This competition has already ended and is no longer accessible on the site
* The submission produced by the following code ranked 8th out of 931 participants

### 1. Preparation : word extraction for the scoring of natural language columns

##### 1-1. Extract words from the description and name columns, and assign to each word the median price of the listings that contain it

In [None]:
# Import libraries first so that `pd` is defined before it is used
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Load the training data (the first CSV column becomes the index)
df = pd.read_csv("train.csv", index_col=0)

# Create the descname column by concatenating the description and name columns.
# Missing values are replaced with empty strings first: NaN + str yields NaN,
# and CountVectorizer refuses NaN documents.
df["descname"] = df["description"].fillna("") + " " + df["name"].fillna("")

In [None]:
# Vectorize with binary=True so that repeated occurrences of a word within a
# single listing are ignored (each cell is 1 if the word appears, else 0)
vectorizer = CountVectorizer(stop_words='english', binary=True)
X = vectorizer.fit_transform(df['descname'])

# Obtain the unique word list and the number of listings containing each word.
# The sparse matrix is summed directly; X.toarray() would allocate the full
# dense (listings x vocabulary) array only to reduce it to one row.
word_counts = np.asarray(X.sum(axis=0)).ravel()
word_list = vectorizer.get_feature_names_out()

# Convert the word list and each word's count into a DataFrame
word_frequency = pd.DataFrame({'word': word_list, 'count': word_counts})


In [None]:
# Get the price values as a NumPy array (one entry per listing, aligned with X's rows)
y_values = df['y'].values

# Work on a column-oriented (CSC) copy of the document-term matrix: the row
# numbers of the listings containing word i are then a cheap slice of
# `indices`, so no dense column has to be materialized per word.
X_csc = X.tocsc()
X_csc.eliminate_zeros()  # guarantee that every stored entry really is a 1

# Calculate the median of y over the rows that contain each word
median_prices = []
for i in range(len(word_list)):
    rows_with_word = X_csc.indices[X_csc.indptr[i]:X_csc.indptr[i + 1]]
    if rows_with_word.size > 0:
        median_prices.append(np.median(y_values[rows_with_word]))
    else:
        # A word that occurs in no row has no defined median
        median_prices.append(np.nan)

# Add the median prices to the DataFrame
word_frequency['median_price'] = median_prices

# Keep only the words that appear in at least 10 listings
filtered_word_frequency = word_frequency[word_frequency['count'] >= 10]

# Export the DataFrame to .xlsx, most frequent words first
filtered_word_frequency.sort_values(by='count', ascending=False).to_excel('Wordlist.xlsx', index=False)

##### 1-2. Decompose amenity column strings into words
* *After exporting the DataFrame, I manually scored each amenity item on a scale of 0 to 4, according to how luxurious I judged a listing offering that amenity to be.*<br>
* *If manual scoring is too cumbersome, the median price per item can be used instead, computed with the same method as for the description/name columns.*

In [None]:
import re
from collections import defaultdict

import pandas as pd

# Count how many listings mention each amenity.
dic = defaultdict(int)

# Iterate over the column values directly: the frame is indexed by the first
# CSV column, so positional labels 0..n-1 are not guaranteed to exist.
for raw_amenities in df["amenities"].dropna():
    match = re.search(r'\{(.*)\}', raw_amenities)
    if match is None:
        continue  # no "{...}" list in this row
    # Strip quotes and surrounding whitespace so that the exported names are
    # exactly the keys that data_pre() looks up when it scores the amenities.
    for key in match.group(1).replace('"', '').split(","):
        key = key.strip()
        if key:  # "{}" yields a single empty item, which is not an amenity
            dic[key] += 1

# Convert the dict into a DataFrame
amenities_df = pd.DataFrame(list(dic.items()), columns=['amenity', 'count'])

# Export the DataFrame into an .xlsx file.
# NOTE(review): section 2 reads scores back from this same file name, so
# re-running this cell presumably overwrites any manually entered scores.
amenities_df.to_excel('amenity_scores.xlsx', index=False)

### 2. Import necessary data

In [None]:
# Load the raw training data and the two scoring spreadsheets
df_raw = pd.read_csv("train.csv", index_col=0)
amenity_scores_df = pd.read_excel("amenity_scores.xlsx")
word_scores_df = pd.read_excel("Wordlist.xlsx", sheet_name="Sheet1")

# Turn each spreadsheet into a {word: score} lookup table.
# Both sheets must provide a `word` column and a `score` column.
# NOTE(review): the files exported in section 1 carry the columns
# ('word', 'count', 'median_price') and ('amenity', 'count'), so the sheets
# were presumably renamed/extended by hand before this step -- confirm.
amenity_scores_dict = amenity_scores_df.set_index("word")["score"].to_dict()
word_scores_dict = word_scores_df.set_index("word")["score"].to_dict()

### 3. Data Preprocessing

In [None]:
def data_pre(df, amenity_scores=None, word_scores=None, reference_date='2017-10-05'):
    """Clean a raw listings DataFrame and derive the model features.

    The input frame is left untouched; a processed copy is returned.

    Steps performed:
      * fill missing bathrooms/bedrooms/beds with the median of listings
        having the same ``accommodates`` value,
      * fill missing review scores, host flags and response rates,
      * turn ``thumbnail_url`` into a 0/1 "has thumbnail" flag,
      * convert the date columns to numbers,
      * score the amenities, name and description with the lookup tables,
      * drop the raw text/location columns and cast the remaining object
        columns to ``category``.

    Args:
        df: Raw listings as read from train.csv / test.csv.
        amenity_scores: Mapping amenity name -> score. Defaults to the
            module-level ``amenity_scores_dict``.
        word_scores: Mapping lower-case word -> score. Defaults to the
            module-level ``word_scores_dict``.
        reference_date: Date from which ``host_days_since`` is counted back.

    Returns:
        A new DataFrame containing the engineered features.
    """
    # Local import so the function does not depend on an earlier notebook
    # cell having imported `re`.
    import re

    if amenity_scores is None:
        amenity_scores = amenity_scores_dict
    if word_scores is None:
        word_scores = word_scores_dict

    # Work on a copy so the caller's DataFrame is not modified in place
    df = df.copy()

    # Missing values in bathrooms / bedrooms / beds: use the median of the
    # listings with the same `accommodates` value. Mapping the group medians
    # is vectorized and leaves NaN (instead of raising KeyError) when
    # `accommodates` itself is missing.
    for column in ['bathrooms', 'bedrooms', 'beds']:
        if df[column].isnull().any():
            medians = df.groupby('accommodates')[column].median()
            df[column] = df[column].fillna(df['accommodates'].map(medians))

    # Missing values in review_scores_rating and the host flag columns
    df['review_scores_rating'] = df['review_scores_rating'].fillna(df['review_scores_rating'].median())
    df['host_identity_verified'] = df['host_identity_verified'].fillna('f')
    df["host_has_profile_pic"] = df["host_has_profile_pic"].fillna('f')

    # host_response_rate: "95%" -> 95.0, then fill missing values with the median
    df["host_response_rate"] = df["host_response_rate"].str.rstrip('%').astype('float')
    df['host_response_rate'] = df['host_response_rate'].fillna(df['host_response_rate'].median())

    # thumbnail_url: 1 if a value is present, 0 if it is null
    df['thumbnail_url'] = df['thumbnail_url'].notna().astype(int)

    # Date columns: a missing host_since becomes the median date; missing
    # review dates fall back to the (already filled) host_since of the row.
    df['host_since'] = pd.to_datetime(df['host_since'])
    df['host_since'] = df['host_since'].fillna(df['host_since'].median())
    for column in ['first_review', 'last_review']:
        df[column] = pd.to_datetime(df[column]).fillna(df['host_since'])

    # Convert the date columns to numbers
    df['host_days_since'] = (pd.Timestamp(reference_date) - df['host_since']).dt.days
    # Seconds since the Unix epoch. Dividing a timedelta by one second is
    # independent of the datetime resolution, unlike astype('int64') // 10**9,
    # which is only correct for nanosecond-resolution columns.
    epoch = pd.Timestamp('1970-01-01')
    one_second = pd.Timedelta(seconds=1)
    for column in ['first_review', 'last_review']:
        df[column] = (df[column] - epoch) // one_second

    def _amenity_score(raw_amenities):
        """Sum the scores of the items in an '{a,"b c",d}' amenities string."""
        if not isinstance(raw_amenities, str):
            return 0  # missing value: nothing to score
        match = re.search(r'\{(.*)\}', raw_amenities.replace('"', ''))
        if match is None:
            return 0
        return sum(amenity_scores.get(key.strip(), 0) for key in match.group(1).split(','))

    def _text_score(text):
        """Sum the scores of the whitespace-separated, lower-cased words of a text."""
        if not isinstance(text, str):
            return 0  # missing value: nothing to score
        # NOTE(review): words are split on whitespace only, so a token with
        # attached punctuation ("pool,") does not match the key "pool".
        return sum(word_scores.get(word.lower(), 0) for word in text.split())

    # Score the amenities, name and description columns
    df['amenity_scores'] = df['amenities'].apply(_amenity_score)
    df['name_scores'] = df['name'].apply(_text_score)
    df['description_scores'] = df['description'].apply(_text_score)

    # Number of words in the description.
    # NOTE: because of str(x), a missing description counts as one word;
    # kept unchanged so the feature values stay the same as before.
    df["description_wordcount"] = df["description"].apply(lambda x: len(str(x).split()))

    # Remove the columns that are no longer needed
    drop_list = ["amenities", "city", "description", "neighbourhood", "name", "host_since", "zipcode"]
    df = df.drop(drop_list, axis=1)

    # Convert the object-typed columns to the category dtype
    object_cols = df.select_dtypes(include='object').columns
    df[object_cols] = df[object_cols].astype('category')

    return df

# Preprocess the freshly loaded raw training data. The `df` from section 1
# still carries the helper column `descname`, which test.csv does not have,
# so using it would give the model a feature that is missing at prediction time.
df = data_pre(df_raw)

### 4. Data Modeling

In [None]:
import optuna.integration.lightgbm as lgb_tune
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Split the data into training and validation parts.
# A fixed seed makes the split (and the reported RMSE) reproducible.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

col = "y"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

# Create the LightGBM datasets. The validation set references the training
# set so that both are binned with the same feature discretization.
trains = lgb.Dataset(train_x, train_y)
valids = lgb.Dataset(val_x, val_y, reference=trains)

# Model parameters
params = {
    "objective": "regression",
    "metric": "rmse",
}

# Hyper-parameter search and training through Optuna's LightGBM tuner
model_tune = lgb_tune.train(
    params, trains, valid_sets=[valids],
    callbacks=[
        lgb.early_stopping(100),  # stop after 100 rounds without improvement
        lgb.log_evaluation(100)   # log the metric every 100 rounds
    ]
)

# Validation RMSE, computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
val_preds = model_tune.predict(val_x)
val_rmse = mean_squared_error(val_y, val_preds) ** 0.5
print(f'Validation RMSE: {val_rmse}')

# Load the test data and preprocess it exactly like the training data
df_test=pd.read_csv("/content/drive/MyDrive/初回成果物/民泊価格予測/test.csv", index_col=0)
df_test=data_pre(df_test)

predict = model_tune.predict(df_test)

### 5. Postprocessing and export of the submission file

In [None]:

# Floor the predictions at 10 dollars: every value that is not above 10 is
# replaced by 10.
floored = []
for pred in predict:
    floored.append(pred if pred > 10 else 10)
predict = floored

# Attach the predictions to the test frame and write them out as
# "index,prediction" rows without a header line.
df_test["y"] = predict
df_test["y"].to_csv("submission.csv", header=False)