In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import joblib
import re

# Function to extract model, capacity, and color from the title
def extract_model_capacity_color(title):
    model_patterns = [
        r'아이폰\s?X', r'아이폰\s?XS', r'XS\s?맥스', r'아이폰\s?XR', r'아이폰\s?11\s?프로\s?맥스', r'아이폰\s?11\s?프로', 
        r'아이폰\s?11', r'아이폰\s?SE\s?2세대', r'아이폰\s?SE\s?1', r'아이폰\s?SE\s?3', r'아이폰\s?12\s?미니',
        r'아이폰\s?12\s?프로\s?맥스', r'아이폰\s?12\s?프로', r'아이폰\s?12', r'아이폰\s?13\s?프로\s?맥스',
        r'아이폰\s?13\s?프로', r'아이폰\s?13\s?미니', r'아이폰\s?13', r'아이폰\s?14\s?플러스', r'아이폰\s?14\s?프로\s?맥스',
        r'아이폰\s?14\s?프로', r'아이폰\s?14', r'아이폰\s?15\s?프로\s?맥스', r'아이폰\s?15\s?프로', r'아이폰\s?15'
    ]
    color_patterns = [
        r'블랙', r'블루', r'옐로우', r'핑크', r'그린', r'그레이', r'실버', r'레드', r'퍼플', r'골드'
    ]
    capacity_patterns = [r'\b64\b', r'\b128\b', r'\b256\b', r'\b512\b']

    model = None
    color = None
    capacity = None

    for pattern in model_patterns:
        if re.search(pattern, title, re.IGNORECASE):
            model = re.search(pattern, title, re.IGNORECASE).group()
            model = model.replace(" ", "")
            break

    for pattern in color_patterns:
        if re.search(pattern, title, re.IGNORECASE):
            color = re.search(pattern, title, re.IGNORECASE).group()
            break

    for pattern in capacity_patterns:
        if re.search(pattern, title):
            capacity = re.search(pattern, title).group() + "GB"
            break

    return model, capacity, color

# Load the dataset
df = pd.read_csv('"C:\ReactClass\iphon_data.csv"')

# Apply the extraction functions
df['model'], df['capacity'], df['color'] = zip(*df['title'].apply(extract_model_capacity_color))

# Encode categorical variables
label_encoders = {}
for column in ['model', 'capacity', 'color']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Prepare features and target variable
X = df[['model', 'capacity', 'color', 'battery_efficiency', 'condition']]
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Save the model to a file
joblib.dump(xgb_model, 'xgboost_price_model.joblib')

# Save the label encoders as well
joblib.dump(label_encoders, 'label_encoders.joblib')

print("Model and encoders saved successfully!")


OSError: [Errno 22] Invalid argument: '"C:\\ReactClass\\iphon_data.csv"'