In [None]:
%pip install kaggle python-dotenv pandas

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
import os
from dotenv import load_dotenv

# Pull the variables defined in a local .env file into this process
load_dotenv()

# Read the Kaggle credentials back from the environment (None when unset)
KAGGLE_USERNAME = os.environ.get('KAGGLE_USERNAME')
KAGGLE_KEY = os.environ.get('KAGGLE_KEY')

In [None]:
# Create a Kaggle API client and authenticate it.
# NOTE(review): presumably authenticate() picks up KAGGLE_USERNAME / KAGGLE_KEY
# from the environment (or ~/.kaggle/kaggle.json) — confirm against the kaggle docs.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

In [None]:
# Download the smartphones dataset archive with the Kaggle CLI
!kaggle datasets download -d chaudharisanika/smartphones-dataset

In [None]:
# Extract the downloaded archive so the CSV read in the next section exists on a
# fresh run. zipfile is used instead of the `unzip` shell tool so the cell works
# on any platform and can be re-run without an overwrite prompt.
import zipfile

with zipfile.ZipFile('smartphones-dataset.zip') as archive:
    archive.extractall('smartphone-dataset')

### SECTION 1: Exploratory Data Analysis

Q1. Load the dataset smartphones.csv using pandas.

In [None]:
# Read the cleaned smartphones CSV from the extracted dataset folder
csv_path = 'smartphone-dataset/Smartphones_cleaned_dataset.csv'
df = pd.read_csv(csv_path)

Q2. Display the shape of the dataset.

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

Q3. Show the first 10 rows of the dataset. 

In [None]:
# Preview the first 10 rows
df.head(10)

Q4. Find all rows where the price is more than 30,000.

In [None]:
# Boolean mask selecting phones priced above 30,000
is_expensive = df['price'] > 30000
high_price = df.loc[is_expensive]
high_price.head()

Q5. Show all unique values in the Brand column.

In [None]:
# Distinct values of the brand column
brand_column = df['brand_name']
brands = brand_column.unique()
print(brands)

Q6. Check for missing values in the dataset.

In [None]:
# Per-column count of missing entries
null_mask = df.isna()
missing_values = null_mask.sum()
print(missing_values)

Q7. Find the average RAM of all smartphones.

In [None]:
# Mean of the ram_capacity column across all phones
avg_ram = df['ram_capacity'].mean()
print(avg_ram)

Q8. Count the number of phones per brand.

In [None]:
# Number of rows (phones) for each brand
phone_per_brand = df['brand_name'].value_counts()
print(phone_per_brand)

Q9. Show the summary statistics for numeric columns.


In [None]:
# Summary statistics, transposed so each column of df becomes a row
df.describe().T

Q10. Which line would show you correlation between numeric columns?

In [None]:
# Pairwise correlation between numeric columns (numeric_only=True skips the rest)
df.corr(numeric_only=True).T

### SECTION 2: Visualization & Plotting

Q11. Import the required visualization libraries

Q12. Plot a histogram of Price

In [None]:
# Keep only the non-missing prices
prices = df['price'].dropna()

# Let matplotlib derive equal-width bins from the observed min/max. The earlier
# hand-written edges started at 100,000, and values below the first bin edge
# are silently left out of the histogram.
bins = 30

# Plot the histogram
plt.hist(prices, color='skyblue', edgecolor='black', bins=bins)

# Add title and axis labels
plt.title("Histogram of Price")
plt.xlabel("Price Range")
plt.ylabel("Frequency")

# Show the plot
plt.show()

Q13. Create a bar plot showing average price per brand.

In [None]:
# Mean price for every brand, highest average first
avg_price = (
    df.groupby('brand_name')['price']
    .mean()
    .sort_values(ascending=False)
)

# Draw the bars on an explicit Axes rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(avg_price.index, avg_price.values, color='skyblue', edgecolor='black')
ax.set_title("Average Price per Brand")
ax.set_xlabel("Brand")
ax.set_ylabel("Average Price")

# Vertical tick labels keep the brand names from overlapping
ax.tick_params(axis='x', labelrotation=90)

fig.tight_layout()
plt.show()

Q14. Plot a heatmap of correlations.

In [None]:
# Correlation matrix of the numeric columns
corr_matrix = df.corr(numeric_only=True)

# Large canvas so the annotated cells stay legible
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap")

fig.tight_layout()
plt.show()

Q15. Show the relationship between RAM and Price using a scatter plot.

In [None]:
# One point per phone: RAM on the x-axis, price on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['ram_capacity'], df['price'], color='teal', edgecolors='black')

ax.set_title("Relationship Between RAM and Price")
ax.set_xlabel("RAM (GB)")
ax.set_ylabel("Price")

fig.tight_layout()
plt.show()

Q16. Create a boxplot to visualize Price distribution across brands.

In [None]:
# One box per brand showing the spread of prices
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='brand_name', y='price', color='Blue', ax=ax)

ax.set_title("Price Distribution Across Brands")
ax.set_xlabel("Brand")
ax.set_ylabel("Price")

# Vertical tick labels so long brand names do not collide
ax.tick_params(axis='x', labelrotation=90)

fig.tight_layout()
plt.show()

Q17. Add a title to a plot.

Q18. Plot a KDE (density plot) of prices.


In [None]:
# Plot KDE of prices
plt.figure(figsize=(10, 6))
sns.kdeplot(data=df, x='price', fill=True, color='purple')

# Add title and labels
plt.title("Price Distribution (KDE)")
plt.xlabel("Price")
plt.ylabel("Density")

# Apply the layout first so the saved image matches what is displayed;
# saving before tight_layout() writes the figure with the unadjusted margins.
plt.tight_layout()
plt.savefig('pricekde.png')
plt.show()

Q19. Rotate x-axis labels for readability.

Q20. Save a seaborn plot to a file.


### SECTION 3: Regression & Price Prediction

Q21. Import the required regression libraries.

Q22. Define features X and target y.


In [None]:
# Column names, dtypes and non-null counts, used to decide how to build X and y
df.info()

In [None]:
# Names of the object-dtype columns
object_cols = df.select_dtypes(include='object').columns
print(object_cols)

In [None]:
# Sub-frame holding only the numeric columns (iterating it yields column names)
numerical_columns = df.select_dtypes(include='number')

In [None]:
# Replace missing numeric values with the mean of their own column
for num_col in numerical_columns:
    column_mean = df[num_col].mean()
    df[num_col] = df[num_col].fillna(column_mean)

In [None]:
# Sub-frame holding only the object-dtype columns (iterating it yields column names)
categorical_columns = df.select_dtypes(include='object')

In [None]:
# Replace missing categorical values with the most frequent value of the column
for cat_col in categorical_columns:
    most_common = df[cat_col].mode()[0]
    df[cat_col] = df[cat_col].fillna(most_common)

In [None]:
# One-hot encode the categorical columns. dtype=int makes only the new dummy
# columns 0/1 integers; casting the whole frame with .astype(int) would also
# truncate every float feature (including mean-imputed values) toward zero.
# NOTE(review): 'model' looks close to a per-row identifier, so encoding it
# probably adds a very wide, sparse block of columns — consider dropping it first.
df_encoded = pd.get_dummies(
    df,
    columns=['brand_name', 'model', 'processor_brand', 'os'],
    drop_first=True,
    dtype=int,
)

# Boolean flag columns (if any) become 0/1 so the frame stays fully numeric
bool_cols = df_encoded.select_dtypes(include='bool').columns
df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)

df_encoded.head()

In [None]:
# Target is the price column; every other encoded column is a feature
target_col = 'price'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

Q23. Split data into training and testing sets.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the test shape first, then the train shape
for split in (X_test, X_train):
    print(split.shape)

Q24. Train a linear regression model.


In [None]:
# Fit an ordinary least-squares model on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Show the fitted intercept and the per-feature coefficients
fitted_intercept = model.intercept_
fitted_coefs = model.coef_
print(f'intercept:{fitted_intercept}\n')
print(f'slope:{fitted_coefs}')

Q25. Make predictions on the test set.


In [None]:
# Predict prices for the held-out test rows
y_pred = model.predict(X_test)

Q26. Calculate Mean Squared Error (MSE).

In [None]:
# Mean squared error between the true and predicted test prices
mse = mean_squared_error(y_test, y_pred)
print(mse)

Q27. Print R² score.

In [None]:
# R² of the predictions on the test split
print(r2_score(y_test, y_pred))

Q28. Add a new feature: Price_per_GB = Price / Storage.

In [None]:
# New column on df_encoded only: X and y were extracted from df_encoded earlier,
# so this column is not part of the features the models above were trained on.
# NOTE(review): it is computed from the target (price); using it as a predictor
# would leak the answer. Also assumes internal_memory is never 0 — confirm.
df_encoded['price_per_gb'] = df_encoded['price']/df_encoded['internal_memory']

Q29. Scale features using StandardScaler

In [None]:
# Learn the mean/std from the training rows only, then apply that same
# transform everywhere. Fitting on the full X lets test-set statistics leak
# into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Full feature matrix scaled with the training statistics (kept for the cells
# below that inspect X_scaled)
X_scaled = scaler.transform(X)


In [None]:
# Wrap the scaled array in a DataFrame with the original feature names so the
# scaled values can be read column by column
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled_df.head()

In [None]:
# Fix the seed so the fitted tree, and the feature importances read from it
# below, are the same on every run
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

### SECTION 4: Feature Importance & Trend Analysis

Q31. Get feature importances from a trained decision tree.

In [None]:
# Pair every feature name with the importance the fitted tree assigned to it
feature_names = X.columns
feature_scores = tree.feature_importances_
importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_scores}
)

In [None]:
# Peek at the (still unsorted) importance table
importances_df.head()

In [None]:
# Rank features by importance and keep only the top_n highest
top_n = 5
ranked_importances = importances_df.sort_values(by='Importance', ascending=False)
importances_df = ranked_importances.head(top_n)
importances_df.head()

Q32. Plot feature importances.

In [None]:
# Horizontal bars: one per retained feature, length = importance
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importances_df, ax=ax)
ax.set_title(f'Top {top_n} Feature Importances')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()

Q33. Select top 3 features using SelectKBest.

In [None]:
# Score every feature against the target with f_regression and configure the
# selector to keep the 3 best-scoring ones
selector = SelectKBest(score_func=f_regression, k=3)
selector.fit(X, y)

Q34. Show the score of each feature.

In [None]:
# Raw score per feature, in the same order as X.columns
print(selector.scores_)

In [None]:
# Table pairing each feature name with its SelectKBest score
score_values = selector.scores_
scores_df = pd.DataFrame({'Feature': X.columns, 'Score': score_values})

In [None]:
# Highest scores first, then take the leading three rows
ranked_scores = scores_df.sort_values(by='Score', ascending=False)
top_features = ranked_scores.head(3)
top_features

Q35. Drop irrelevant columns like Model_Name.

In [None]:
# drop(..., inplace=True) returns None, so the old assignment stored nothing
# useful. Rebind df to the returned frame instead; errors='ignore' lets the cell
# be re-run after 'model' has already been removed without raising KeyError.
df = df.drop(columns=['model'], errors='ignore')

Q36. Sort data by RAM descending.

In [None]:
# Display the rows ordered from largest to smallest RAM (df itself is not modified)
df.sort_values(by='ram_capacity', ascending=False)

Q37. Filter phones with RAM > 8GB and Battery > 4000mAh.

In [None]:
# Build each condition separately, then require both to hold
has_big_ram = df['ram_capacity'] > 8
has_big_battery = df['battery_capacity'] > 4000
df[has_big_ram & has_big_battery]

Q38. Group by Brand and get the average of all numeric features.

In [None]:
# Per-brand mean of every numeric column (numeric_only=True skips the others)
df.groupby('brand_name').mean(numeric_only=True)

In [None]:
# List the column names currently in df
df.columns

Q39. Count how many phones have dual SIM support.

Q40. Create a pairplot to examine pairwise feature trends.

In [None]:
# Pairwise plots for four numeric columns; the title is placed slightly above
# the grid (y=1.02)
sns.pairplot(df[['price', 'ram_capacity', 'battery_capacity', 'processor_speed']])
plt.suptitle('Pairplot of Key Numeric Features', y=1.02)
plt.show()

### SECTION 5: Train-Test Split & Feature Scaling

### Train-Test Split: Why & How
#### Q41. Why do we split data into training and testing sets?

A) To keep the dataset small

B) To check if the model can memorize data

C) To evaluate model performance on unseen data

D) To make training faster

#### Ans = C

We split our dataset into:

- Training set - to teach the model

- Testing set - to check how well it performs on new, unseen data

This helps us know:

- Is the model really learning patterns?

- Or is it just memorizing?

#### Q42. Fill in the code to split the data into 80% training and 20% testing.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Q43. What does random_state=42 do in train_test_split()?

A) Speeds up the model

B) Ensures reproducibility

C) Shuffles the labels

D) It’s required for all sklearn models

#### Ans = B

When you do a train_test_split(), it randomly shuffles and splits your data.

If you don't set random_state, you'll get different splits every time you run the code.

But if you set it (for example `random_state=42`), then:

- You will always get the same split every time you run the code.

- This makes your results reproducible and easier to debug or share.

#### Q44. What could happen if you evaluate your model on the same data it was trained on?

A) You'll get accurate performance

B) It will reflect real-world accuracy

C) It will hide overfitting and give misleading results

D) Nothing changes

#### Ans = C

If you evaluate your model on the same data it was trained on, it may perform very well but only because it memorized the answers.

This gives you:

- A false sense of accuracy

- Results that don’t reflect how it will perform on new, unseen data

That’s called overfitting — when the model learns the training data too well, including noise or specific examples, and fails on new data.

In Machine Learning Terms:
- A model that memorizes the training data = overfit
- A model that learns the patterns = generalizes to new data

### Feature Scaling: Why & How

#### Q45. Why do we scale features before training a model?

A) To make the data easier to store

B) To reduce RAM usage

C) To ensure all features contribute equally

D) To remove duplicates

#### Ans = C

#### Q46. What is the most common scaler used in regression problems?

A) OneHotEncoder

B) MinMaxScaler

C) StandardScaler

D) LabelEncoder

#### Ans = C

In regression problems, we care about:

- Centering data (mean = 0)

- Equal feature scale (standard deviation = 1)

StandardScaler is the most common scaler because:

- It is less distorted by outliers than MinMaxScaler (although it is not fully robust to them)

- It makes models like Linear Regression and Ridge/Lasso work better

- It's great when the data follows a normal distribution (or close to it)

#### Q47. Fill in the missing code to scale features using StandardScaler.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

#### Q48. If a feature like RAM ranges from 2 to 16 and Battery ranges from 2000 to 6000, what problem could occur if not scaled?

A) Battery will dominate during training because its values are much larger than RAM's

B) The model will always predict zeros

C) Training will be faster

D) Battery will be ignored completely

#### Ans = A