In [10]:
#Task 1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

np.random.seed(42)
n_samples = 100

square_feet = np.random.normal(1800, 300, n_samples).astype(int)
bedrooms = np.random.choice([2, 3, 4, 5], size=n_samples)
bathrooms = np.random.choice([1, 2, 3], size=n_samples)
age = np.random.randint(0, 50, size=n_samples)
neighborhood = np.random.choice(['A', 'B', 'C'], size=n_samples)


price = (
    square_feet * 200 +
    bedrooms * 10000 +
    bathrooms * 15000 -
    age * 1000 +
    np.where(neighborhood == 'B', 25000, 0) +
    np.where(neighborhood == 'C', 40000, 0) +
    np.random.normal(0, 20000, n_samples)
).astype(int)


data = pd.DataFrame({
    'square_feet': square_feet,
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'age': age,
    'neighborhood': neighborhood,
    'price': price
})

df = pd.get_dummies(data, columns=['neighborhood'], drop_first=True)

X = df.drop('price', axis=1)
y = df['price']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


model = LinearRegression()
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: Rs. {rmse:,.2f}")
print(f"R² Score: {r2:.3f}")


new_house = pd.DataFrame([{
    'square_feet': 2000,
    'bedrooms': 3,
    'bathrooms': 2,
    'age': 10,
    'neighborhood_B': 1,
    'neighborhood_C': 0
}])

predicted_price = model.predict(new_house)[0]
print(f"Predicted Price: Rs. {predicted_price:,.2f}")


RMSE: Rs. 19,618.80
R² Score: 0.918
Predicted Price: Rs. 476,116.65


In [16]:
#Task 2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

data = pd.DataFrame({
    'email': [
        "Win a FREE iPhone now!!! Click here http://spam.com",
        "Reminder: Your meeting is at 10am",
        "Congratulations, you have been selected for a prize!",
        "Hi John, can we reschedule our appointment?",
        "Get cheap meds online!!! http://buyspam.com",
        "Dear user, update your bank info here: http://fakebank.com",
        "Lunch at 1pm?",
        "URGENT: You won $10,000. Claim now!"
    ],
    'sender': [
        "promo@spam.com", "boss@company.com", "lottery@spam.org",
        "colleague@company.com", "pharmacy@spammed.com",
        "fraud@phish.com", "friend@mail.com", "winner@fakeprize.com"
    ],
    'label': [1, 0, 1, 0, 1, 1, 0, 1]
})


data['length'] = data['email'].apply(len)

data['has_link'] = data['email'].apply(lambda x: int('http' in x.lower()))

data['domain'] = data['sender'].apply(lambda x: x.split('@')[-1])

data = pd.get_dummies(data, columns=['domain'], drop_first=True)

vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(data['email'])

X_meta = data.drop(columns=['email', 'sender', 'label']).astype(np.float64)  # Ensure numeric type
from scipy.sparse import hstack
X = hstack([X_text, X_meta])
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("🔍 Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

def classify_email(email_text, sender_email):
    length = len(email_text)
    has_link = int('http' in email_text.lower())
    domain = sender_email.split('@')[-1]

    email_tfidf = vectorizer.transform([email_text])

    domain_cols = [col for col in data.columns if col.startswith('domain_')]
    domain_data = pd.DataFrame([{f'domain_{domain}': 1 if f'domain_{domain}' == f'domain_{domain}' else 0 for domain in domain_cols}], columns=domain_cols)

    missing_cols = set(domain_cols) - set(domain_data.columns)
    for col in missing_cols:
        domain_data[col] = 0
    domain_data = domain_data[domain_cols]

    meta_features = pd.DataFrame([{
        'length': length,
        'has_link': has_link
    }])

    full_features = pd.concat([meta_features, domain_data], axis=1)

    full_features = full_features.fillna(0)

    from scipy.sparse import hstack
    X_new = hstack([email_tfidf, full_features])

    prediction = model.predict(X_new)[0]
    return "Spam" if prediction == 1 else "Not Spam"

new_email = "Hey, just checking if we’re still on for dinner tonight?"
new_sender = "friend@socialmail.com"
result = classify_email(new_email, new_sender)
print(f"📩 New email classified as: {result}")

🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

Accuracy: 0.50
📩 New email classified as: Spam


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
#Task 3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
import numpy as np

data = pd.read_csv("customer_data.csv")

imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

for col in ['total_spending', 'age', 'number_of_visits']:
    upper_limit = data_imputed[col].quantile(0.95)
    lower_limit = data_imputed[col].quantile(0.05)
    data_imputed[col] = np.clip(data_imputed[col], lower_limit, upper_limit)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_imputed[['total_spending', 'age', 'number_of_visits', 'purchase_frequency']])

data_imputed[['total_spending', 'age', 'number_of_visits', 'purchase_frequency']] = scaled_features

X = data_imputed[['total_spending', 'age', 'number_of_visits', 'purchase_frequency']]
y = data_imputed['customer_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy:", accuracy_score(y_test, y_pred))

coefficients = svm_model.coef_[0]
features = X.columns

print("\nFeature Importance (coefficients):")
for feature, coef in zip(features, coefficients):
    print(f"{feature}: {coef}")

print("\nExtracted Rules:")
for i, coef in enumerate(coefficients):
    if coef > 0:
        print(f"Increase {features[i]} -> Likely to be a high-value customer.")
    elif coef < 0:
        print(f"Decrease {features[i]} -> Likely to be a low-value customer.")

Classification Report:
              precision    recall  f1-score   support

         0.0       0.67      1.00      0.80         2
         1.0       1.00      0.67      0.80         3

    accuracy                           0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5

Accuracy: 0.8

Feature Importance (coefficients):
total_spending: 1.2208289329220823
age: -0.1834414056595162
number_of_visits: 0.14786521471931502
purchase_frequency: 0.5349561709003001

Extracted Rules:
Increase total_spending -> Likely to be a high-value customer.
Decrease age -> Likely to be a low-value customer.
Increase number_of_visits -> Likely to be a high-value customer.
Increase purchase_frequency -> Likely to be a high-value customer.
