In [None]:
# 라이브러리 불러오기
import pandas as pd
from google.colab import files

# 파일 업로드 확인
print("drugsComTrain_raw.csv와 drugsComTest_raw.csv 파일을 업로드하세요.")
uploaded = files.upload()

# 파일 이름 설정
train_file = "drugsComTrain_raw.csv"
test_file = "drugsComTest_raw.csv"

# CSV 파일 읽기
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# 데이터 확인
print("\n=== Train 데이터 미리 보기 ===")
print(train_df.head())
print("Train 데이터 크기:", train_df.shape)

print("\n=== Test 데이터 미리 보기 ===")
print(test_df.head())
print("Test 데이터 크기:", test_df.shape)


drugsComTrain_raw.csv와 drugsComTest_raw.csv 파일을 업로드하세요.


Saving drugsComTest_raw.csv to drugsComTest_raw.csv
Saving drugsComTrain_raw.csv to drugsComTrain_raw.csv

=== Train 데이터 미리 보기 ===
   uniqueID                  drugName                     condition  \
0    206461                 Valsartan  Left Ventricular Dysfunction   
1     95260                Guanfacine                          ADHD   
2     92703                    Lybrel                 Birth Control   
3    138000                Ortho Evra                 Birth Control   
4     35696  Buprenorphine / naloxone             Opiate Dependence   

                                              review  rating       date  \
0  "It has no side effect, I take it in combinati...       9  20-May-12   
1  "My son is halfway through his fourth week of ...       8  27-Apr-10   
2  "I used to take another oral contraceptive, wh...       5  14-Dec-09   
3  "This is my first time using any form of birth...       8   3-Nov-15   
4  "Suboxone has completely turned my life around...       9  27-No

In [None]:
# 결측치 처리
train_df = train_df.dropna(subset=['condition', 'review', 'rating'])
test_df = test_df.dropna(subset=['condition', 'review', 'rating'])

# 중복 제거
train_df = train_df.drop_duplicates()
test_df = test_df.drop_duplicates()

# 텍스트 전처리
import re
def clean_text(text):
    text = text.lower()  # 소문자 변환
    text = re.sub(r"<.*?>", "", text)  # HTML 태그 제거
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # 알파벳과 공백만 남김
    text = re.sub(r"\s+", " ", text).strip()  # 불필요한 공백 제거
    return text

train_df['cleaned_review'] = train_df['review'].apply(clean_text)
test_df['cleaned_review'] = test_df['review'].apply(clean_text)

# 숫자형 데이터 정리 (Rating)
train_df['rating'] = train_df['rating'].astype(float)
test_df['rating'] = test_df['rating'].astype(float)

# 전처리 결과 확인
print("\n=== 전처리된 Train 데이터 ===")
print(train_df[['condition', 'cleaned_review', 'rating']].head())
print("Train 데이터 크기:", train_df.shape)

print("\n=== 전처리된 Test 데이터 ===")
print(test_df[['condition', 'cleaned_review', 'rating']].head())
print("Test 데이터 크기:", test_df.shape)

# 파일 저장
train_df.to_csv("processed_drug_review_train.csv", index=False)
test_df.to_csv("processed_drug_review_test.csv", index=False)
print("\n전처리된 데이터가 저장되었습니다: 'processed_drug_review_train.csv', 'processed_drug_review_test.csv'")



=== 전처리된 Train 데이터 ===
                      condition  \
0  Left Ventricular Dysfunction   
1                          ADHD   
2                 Birth Control   
3                 Birth Control   
4             Opiate Dependence   

                                      cleaned_review  rating  
0  it has no side effect i take it in combination...     9.0  
1  my son is halfway through his fourth week of i...     8.0  
2  i used to take another oral contraceptive whic...     5.0  
3  this is my first time using any form of birth ...     8.0  
4  suboxone has completely turned my life around ...     9.0  
Train 데이터 크기: (160398, 8)

=== 전처리된 Test 데이터 ===
                      condition  \
0                    Depression   
1  Crohn's Disease, Maintenance   
2       Urinary Tract Infection   
3                   Weight Loss   
4                 Birth Control   

                                      cleaned_review  rating  
0  ive tried a few antidepressants over the years...    10.0  
1

In [None]:
# 라이브러리 불러오기
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1. 데이터 로드
train_df = pd.read_csv("processed_drug_review_train.csv")
test_df = pd.read_csv("processed_drug_review_test.csv")

# 2. 텍스트 벡터화 (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)  # 최대 5000개의 특성 생성
X = vectorizer.fit_transform(train_df['cleaned_review'])
y = train_df['rating']

# 3. Train-Test 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. 모델 학습 (Linear Regression)
model = LinearRegression()
model.fit(X_train, y_train)

# 5. 예측 및 평가
y_pred = model.predict(X_val)

# 평가 지표
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("\n=== 회귀 모델 평가 ===")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

# 6. 테스트 데이터 예측
X_test = vectorizer.transform(test_df['cleaned_review'])
test_predictions = model.predict(X_test)

# 테스트 데이터 결과 저장
test_df['predicted_rating'] = test_predictions
test_df.to_csv("test_predictions.csv", index=False)
print("\n테스트 데이터 예측 결과가 저장되었습니다: 'test_predictions.csv'")


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
# 라이브러리 불러오기
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 1. 데이터 로드
train_df = pd.read_csv("processed_drug_review_train.csv")
test_df = pd.read_csv("processed_drug_review_test.csv")

# 2. 결측치 확인 및 제거
train_df = train_df.dropna(subset=['cleaned_review'])
test_df = test_df.dropna(subset=['cleaned_review'])

# 3. 텍스트 벡터화 (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)  # 최대 5000개의 특성 생성
X = vectorizer.fit_transform(train_df['cleaned_review'])
y = train_df['rating']

# 4. Train-Test 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. 모델 학습 (Linear Regression)
model = LinearRegression()
model.fit(X_train, y_train)

# 6. 예측 및 평가
y_pred = model.predict(X_val)

# 평가 지표
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("\n=== 회귀 모델 평가 ===")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

# 7. 테스트 데이터 예측
X_test = vectorizer.transform(test_df['cleaned_review'])
test_predictions = model.predict(X_test)

# 테스트 데이터 결과 저장
test_df['predicted_rating'] = test_predictions
test_df.to_csv("test_predictions.csv", index=False)
print("\n테스트 데이터 예측 결과가 저장되었습니다: 'test_predictions.csv'")



=== 회귀 모델 평가 ===
Mean Squared Error (MSE): 5.60
R-squared (R²): 0.47

테스트 데이터 예측 결과가 저장되었습니다: 'test_predictions.csv'


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest Regressor 모델 학습
rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# 예측 및 평가
y_rf_pred = rf_model.predict(X_val)

# 평가 지표
mse_rf = mean_squared_error(y_val, y_rf_pred)
r2_rf = r2_score(y_val, y_rf_pred)

print("\n=== Random Forest 회귀 모델 평가 ===")
print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"R-squared (R²): {r2_rf:.2f}")



=== Random Forest 회귀 모델 평가 ===
Mean Squared Error (MSE): 8.62
R-squared (R²): 0.19


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting Regressor 모델 학습
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb_model.fit(X_train, y_train)

# 예측 및 평가
y_gb_pred = gb_model.predict(X_val)

# 평가 지표
mse_gb = mean_squared_error(y_val, y_gb_pred)
r2_gb = r2_score(y_val, y_gb_pred)

print("\n=== Gradient Boosting 회귀 모델 평가 ===")
print(f"Mean Squared Error (MSE): {mse_gb:.2f}")
print(f"R-squared (R²): {r2_gb:.2f}")



=== Gradient Boosting 회귀 모델 평가 ===
Mean Squared Error (MSE): 7.08
R-squared (R²): 0.34
