In [1]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor  # Modified import
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Stream the dataset lazily instead of loading the whole file at once
data_chunks = pd.read_csv("car_prices.csv", chunksize=1000)  # 1000 rows per chunk

# Accumulators filled in by the per-chunk loop: one averaged score and one
# array of predictions for each processed chunk
accuracy_scores, all_predictions = [], []

# Columns that receive explicit preprocessing
numeric_columns = ['year']  # treated as the only numerical feature that is imputed
categorical_columns = [
    'make', 'model', 'trim', 'body', 'transmission', 'vin',
    'state', 'color', 'interior', 'seller', 'saledate',
]
# NOTE(review): 'vin' and 'saledate' look like near-unique values; one-hot
# encoding them may add many columns that carry little signal -- confirm
# they are meant to be model features.

# Preprocessing: mean-impute the numeric column, one-hot encode the
# categorical ones (categories unseen during fit are ignored at predict time),
# and pass every other column through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='passthrough',
)

# Decision tree regressor chained behind the preprocessing step
decision_tree = DecisionTreeRegressor(random_state=42)
dt_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', decision_tree),
])

# Perform 10-fold cross-validation on each chunk.
#
# For every chunk of the CSV this loop:
#   1. drops rows with missing values,
#   2. scores the pipeline with shuffled K-fold cross-validation (the
#      "accuracy" reported is the regressor's R-squared score),
#   3. predicts the selling price for every row of the chunk, and
#   4. writes the features plus predictions to a per-chunk CSV file.
# After the loop the per-chunk frames are combined into one CSV file.
n_splits = 10

# Per-chunk "features + prediction" frames. They are kept so the combined
# output can be built after the loop: the file is only ever read in chunks,
# so there is no full in-memory frame to join the predictions back onto.
prediction_frames = []

for i, chunk in enumerate(data_chunks, 1):
    print(f"Processing chunk {i}...")
    # Drop rows with missing values
    chunk = chunk.dropna()

    # KFold raises ValueError when asked for more splits than there are rows,
    # so chunks left too small by dropna() are skipped instead of crashing.
    if len(chunk) < n_splits:
        print(f"Skipping chunk {i}: only {len(chunk)} complete rows, need at least {n_splits}.")
        continue

    # Extract features and target variable
    X = chunk.drop(columns=["sellingprice"])  # Features
    y = chunk["sellingprice"]  # Target variable

    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # R-squared score of each fold for this chunk
    accuracy_scores_chunk = []

    # Perform 10-fold cross-validation
    for train_index, test_index in kf.split(X):
        # Split data into train and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the model on the training folds
        dt_model.fit(X_train, y_train)

        # score() predicts on X_test internally and returns R-squared, so no
        # separate predict() call is needed here.
        accuracy = dt_model.score(X_test, y_test)
        accuracy_scores_chunk.append(accuracy)

    # Calculate average accuracy for the chunk
    avg_accuracy_chunk = np.mean(accuracy_scores_chunk)
    print(f"Average accuracy for chunk {i}: {avg_accuracy_chunk}")

    # Append average accuracy to list
    accuracy_scores.append(avg_accuracy_chunk)

    # Predict on the entire chunk.
    # NOTE(review): dt_model is the model fitted in the LAST fold, so most of
    # these rows were part of its training data and the predictions are largely
    # in-sample -- confirm this is intended rather than out-of-fold predictions.
    chunk_predictions = dt_model.predict(X)

    # Store predictions in the list
    all_predictions.append(chunk_predictions)

    # Attach the predictions to the feature columns. assign() places the array
    # positionally, whereas concat(axis=1) would align on index labels and
    # mismatch rows because dropna() leaves gaps in the chunk's index.
    predictions_df = X.assign(sellingprice_predicted=chunk_predictions)
    prediction_frames.append(predictions_df)

    # Save predictions to CSV file
    predictions_df.to_csv(f"decision_tree_predictions_for_chunk_{i}.csv", index=False)

if prediction_frames:
    # Calculate net accuracy across chunks
    net_accuracy = np.mean(accuracy_scores)
    print(f"Net accuracy across chunks: {net_accuracy}")

    # Combine the per-chunk frames (features + predicted price) into a single
    # DataFrame with a fresh, continuous index
    all_predictions_df = pd.concat(prediction_frames, ignore_index=True)

    # Save all predictions to a single CSV file
    all_predictions_df.to_csv("all_predictions.csv", index=False)

    print("Predictions saved successfully.")
else:
    # Nothing was evaluated, so there is no score to average and nothing to save
    print("No chunk had enough complete rows to evaluate; no predictions were saved.")


Processing chunk 1...
Average accuracy for chunk 1: 0.9385061674897207
Processing chunk 2...
Average accuracy for chunk 2: 0.8895590377386433
Processing chunk 3...
Average accuracy for chunk 3: 0.8968956875768009
Processing chunk 4...
Average accuracy for chunk 4: 0.8308946896911349
Processing chunk 5...
Average accuracy for chunk 5: 0.8504032510150543
Processing chunk 6...
Average accuracy for chunk 6: 0.9627769520931254
Processing chunk 7...
Average accuracy for chunk 7: 0.9447334568309111
Processing chunk 8...
Average accuracy for chunk 8: 0.9451285154794691
Processing chunk 9...
Average accuracy for chunk 9: 0.8805738693541812
Processing chunk 10...
Average accuracy for chunk 10: 0.9114195817823209
Processing chunk 11...
Average accuracy for chunk 11: 0.9191144336977193
Processing chunk 12...
Average accuracy for chunk 12: 0.8733716033889151
Processing chunk 13...
Average accuracy for chunk 13: 0.8557067402750214
Processing chunk 14...


KeyboardInterrupt: 