In [5]:



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from google.colab import files

# Step 1: Upload CSV
# Goal: make sure the data is available under the exact name that Step 2 reads
# ("customers-100.csv"), whether it was uploaded just now or was already present.
import os

try:
  uploaded = files.upload()
except Exception as e:
  # The upload widget may be unavailable (e.g. when not running in Colab);
  # fall back to a copy of the file that is already in the working directory.
  print(f"Error uploading file: {e}. Please ensure 'customers-100.csv' is in the current directory.")
  uploaded = {}

if uploaded:
  # Colab stores a repeated upload under a new name (e.g. "customers-100 (2).csv")
  # instead of overwriting, so reading "customers-100.csv" afterwards would silently
  # use the older copy. Write the freshly uploaded bytes to the canonical name.
  # files.upload() returns {filename: file contents}; if several files were
  # selected, the first one is used.
  uploaded_bytes = next(iter(uploaded.values()))
  with open("customers-100.csv", "wb") as csv_file:
    csv_file.write(uploaded_bytes)
elif not os.path.exists("customers-100.csv"):
  # Nothing was uploaded (error or cancelled dialog) and there is no earlier copy.
  raise FileNotFoundError("customers-100.csv not found. Please upload the file.")


# Step 2: Load & clean
# Customers whose country differs from HOME_COUNTRY are labelled foreign (1).
HOME_COUNTRY = "India"

df = pd.read_csv("customers-100.csv")
# Keep only the columns used below and drop rows missing any of them.
df = df[['City', 'Company', 'Country']].dropna()

# Compare on a whitespace-stripped, case-folded copy so that values such as
# " India" or "india" are not mislabelled as foreign. The stored 'Country'
# column itself is left untouched for the encoding step.
country_normalised = df['Country'].astype(str).str.strip().str.casefold()
df['Is_Foreign'] = (country_normalised != HOME_COUNTRY.casefold()).astype(int)

# Check class distribution in the original data
print("Original Class Distribution (Is_Foreign):")
print(df['Is_Foreign'].value_counts())

# A classifier cannot be trained or stratified on a single class, so stop early
# with a clear message instead of failing later inside scikit-learn.
if df['Is_Foreign'].nunique() < 2:
    raise ValueError("The dataset contains only one class for 'Is_Foreign'. Please provide a dataset with at least two classes.")


# Step 3: Encode categorical features
# Each text column is replaced by integer codes from its own, freshly fitted encoder.
for column_name in ('City', 'Company', 'Country'):
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

# Step 4: Train & test model
# 'Country' is deliberately NOT a feature: 'Is_Foreign' is computed directly from
# it, so including it would leak the target into the inputs and make the reported
# accuracy meaningless.
# NOTE(review): 'City' and 'Company' are arbitrary integer codes from LabelEncoder;
# a linear model reads them as ordered magnitudes. Consider one-hot encoding.
X = df[['City', 'Company']]
y = df['Is_Foreign']

# Use stratify to keep the class distribution similar in train and test sets;
# a fixed random_state makes the split (and the metrics) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Check class distribution in the training data after splitting
print("\nTraining Class Distribution (y_train):")
print(y_train.value_counts())

model = LogisticRegression()
model.fit(X_train, y_train)

# Step 5: Evaluate
# Score the fitted model on the held-out split and report both metrics.
pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, pred)
print("\n✅ Accuracy:", test_accuracy)

test_confusion = confusion_matrix(y_test, pred)
print("📊 Confusion Matrix:\n", test_confusion)

Saving customers-100.csv to customers-100 (2).csv
Original Class Distribution (Is_Foreign):
Is_Foreign
1    100
Name: count, dtype: int64


ValueError: The dataset contains only one class for 'Is_Foreign'. Please provide a dataset with at least two classes.