In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input/sales-analysis/SalesKaggle3.csv'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [2]:
import pandas as pd
import numpy as np

# For train/test splitting and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

In [3]:
# For preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

In [6]:
df = pd.read_csv("/kaggle/input/sales-analysis/SalesKaggle3.csv")

# Print basic info for sanity check
print("Initial data shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

Initial data shape: (198917, 14)
Columns: ['Order', 'File_Type', 'SKU_number', 'SoldFlag', 'SoldCount', 'MarketingType', 'ReleaseNumber', 'New_Release_Flag', 'StrengthFactor', 'PriceReg', 'ReleaseYear', 'ItemCount', 'LowUserPrice', 'LowNetPrice']


Unnamed: 0,Order,File_Type,SKU_number,SoldFlag,SoldCount,MarketingType,ReleaseNumber,New_Release_Flag,StrengthFactor,PriceReg,ReleaseYear,ItemCount,LowUserPrice,LowNetPrice
0,2,Historical,1737127,0.0,0.0,D,15,1,682743.0,44.99,2015,8,28.97,31.84
1,3,Historical,3255963,0.0,0.0,D,7,1,1016014.0,24.81,2005,39,0.0,15.54
2,4,Historical,612701,0.0,0.0,D,0,0,340464.0,46.0,2013,34,30.19,27.97
3,6,Historical,115883,1.0,1.0,D,4,1,334011.0,100.0,2006,20,133.93,83.15
4,7,Historical,863939,1.0,1.0,D,2,1,1287938.0,121.95,2010,28,4.0,23.99


In [7]:
# 2. Drop or Save Irrelevant Columns
if "Order" in df.columns:
    df.drop(columns=["Order"], inplace=True)


In [8]:
# Save SKU_number (which can act as an identifier) for later
sku_ids = df["SKU_number"].copy()

In [9]:
# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

Missing values per column:
File_Type                0
SKU_number               0
SoldFlag            122921
SoldCount           122921
MarketingType            0
ReleaseNumber            0
New_Release_Flag         0
StrengthFactor           0
PriceReg                 0
ReleaseYear              0
ItemCount                0
LowUserPrice             0
LowNetPrice              0
dtype: int64


In [10]:
# For the target column 'SoldFlag', decide whether to drop or fill missing values.
# For example, you may want to fill missing SoldFlag values with 0.
if df["SoldFlag"].isnull().sum() > 0:
    print("Filling missing values in SoldFlag with 0.")
    df["SoldFlag"] = df["SoldFlag"].fillna(0)

Filling missing values in SoldFlag with 0.


In [11]:
# 4. Define Feature Set and Target Variable
# Drop target and identifier columns from features.
X = df.drop(columns=["SoldFlag", "SKU_number"])
# Convert target to integer type.
y = df["SoldFlag"].astype(int)


In [14]:
# 5. Build the Preprocessing Pipeline
# Define numeric and categorical features.

numeric_features = ["SoldCount", "ReleaseNumber", "StrengthFactor", "PriceReg"]
categorical_features = ["File_Type", "MarketingType", "New_Release_Flag"]

# Numeric pipeline: impute missing values (median) then scale them.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical pipeline: impute missing values (most frequent) then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine them using ColumnTransformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

In [15]:
# 6. Create the Full Pipeline with Random Forest

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])


In [16]:
# 7. Split Data into Train and Test Sets

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (159133, 11) Test shape: (39784, 11)


In [17]:
pipeline.fit(X_train, y_train)

In [18]:
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

In [19]:
# Print evaluation metrics.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0
ROC AUC Score: 1.0
Confusion Matrix:
 [[37185     0]
 [    0  2599]]


In [20]:
predictions = pipeline.predict_proba(X)[:, 1]
df["Sale_Probability"] = predictions

# Optionally, reattach the SKU_number if needed
df["SKU_number"] = sku_ids

# Save the scored file for further use.
df.to_csv("scored_inventory_rf.csv", index=False)
print("Scored file saved as 'scored_inventory_rf.csv'")

Scored file saved as 'scored_inventory_rf.csv'


In [21]:
df.head()


Unnamed: 0,File_Type,SKU_number,SoldFlag,SoldCount,MarketingType,ReleaseNumber,New_Release_Flag,StrengthFactor,PriceReg,ReleaseYear,ItemCount,LowUserPrice,LowNetPrice,Sale_Probability
0,Historical,1737127,0.0,0.0,D,15,1,682743.0,44.99,2015,8,28.97,31.84,0.0
1,Historical,3255963,0.0,0.0,D,7,1,1016014.0,24.81,2005,39,0.0,15.54,0.0
2,Historical,612701,0.0,0.0,D,0,0,340464.0,46.0,2013,34,30.19,27.97,0.0
3,Historical,115883,1.0,1.0,D,4,1,334011.0,100.0,2006,20,133.93,83.15,1.0
4,Historical,863939,1.0,1.0,D,2,1,1287938.0,121.95,2010,28,4.0,23.99,1.0
