In [1]:
!pip install kaggle



In [2]:
import os

os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

!kaggle competitions download -c playground-series-s5e4

playground-series-s5e4.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
import zipfile

# 압축 풀기
with zipfile.ZipFile("playground-series-s5e4.zip", "r") as zip_ref:
    zip_ref.extractall("data")  # 또는 . 으로 현재 폴더에 풀 수도 있음

# 데이터 읽기 예시
import pandas as pd

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
submission = pd.read_csv("data/sample_submission.csv")


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [5]:
train.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [6]:
train.isnull().sum()

id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


In [8]:
train['Episode_Length_minutes'] = train['Episode_Length_minutes'].fillna(train['Episode_Length_minutes'].median())
train['Guest_Popularity_percentage'] = train['Guest_Popularity_percentage'].fillna(0).astype(float)

test['Episode_Length_minutes'] = test['Episode_Length_minutes'].fillna(test['Episode_Length_minutes'].median())
test['Guest_Popularity_percentage'] = test['Guest_Popularity_percentage'].fillna(0).astype(float)

In [9]:
train['Number_of_Ads'] = train['Number_of_Ads'].fillna(train['Number_of_Ads'].median())
test['Number_of_Ads'] = test['Number_of_Ads'].fillna(test['Number_of_Ads'].median())

In [10]:
train.isnull().sum()
test.isnull().sum()

id                             0
Podcast_Name                   0
Episode_Title                  0
Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
dtype: int64

In [28]:
# object 컬럼 확인
print("Object 타입 컬럼 목록:")
print(train.select_dtypes(include='object').columns)

# 삭제 대상 지정
drop_cols = ['Podcast_Name', 'Episode_Title']

# 범주형 지정
categorical_cols = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']

# concat 전에 삭제
train_drop = train.drop(columns=drop_cols + ['Listening_Time_minutes'])  # 타깃도 잠깐 제외
test_drop = test.drop(columns=drop_cols)

# concat
all_data = pd.concat([train_drop, test_drop], axis=0)

# dummies 적용
all_data_encoded = pd.get_dummies(all_data, columns=categorical_cols)

# 다시 분리
X_train = all_data_encoded.iloc[:len(train), :]
X_test = all_data_encoded.iloc[len(train):, :]
y_train = train['Listening_Time_minutes']


Object 타입 컬럼 목록:
Index(['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day',
       'Publication_Time', 'Episode_Sentiment'],
      dtype='object')


In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [30]:
print(train.dtypes)
print(test.dtypes)

id                               int64
Podcast_Name                    object
Episode_Title                   object
Episode_Length_minutes         float64
Genre                           object
Host_Popularity_percentage     float64
Publication_Day                 object
Publication_Time                object
Guest_Popularity_percentage    float64
Number_of_Ads                  float64
Episode_Sentiment               object
Listening_Time_minutes         float64
dtype: object
id                               int64
Podcast_Name                    object
Episode_Title                   object
Episode_Length_minutes         float64
Genre                           object
Host_Popularity_percentage     float64
Publication_Day                 object
Publication_Time                object
Guest_Popularity_percentage    float64
Number_of_Ads                  float64
Episode_Sentiment               object
dtype: object


In [34]:
# 검증용 분리
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# 학습
model = LinearRegression()
model.fit(X_tr, y_tr)


y_pred = model.predict(X_val)

In [35]:
test_preds = model.predict(X_test)

In [36]:
import pandas as pd

submission = pd.DataFrame({
    'id': test['id'].reset_index(drop=True),
    'Listening_Time_minutes': test_preds
})

In [37]:
submission.to_csv("submission0.csv", index=False)
print(submission.head())

       id  Listening_Time_minutes
0  750000               55.791563
1  750001               20.539834
2  750002               51.589714
3  750003               81.540706
4  750004               49.889434
