### 1. Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd

# Google Drive download link for the dataset CSV
url = 'https://drive.google.com/uc?id=1Lff4UcvEu3cdCxWwSy-sRRIzHXiIqjnt'

# Read the CSV straight from the URL into a DataFrame
data = pd.read_csv(filepath_or_buffer=url)

# Preview the first 5 rows
print(data.head(n=5))


### Step 2: Check for missing values and handle them

In [None]:
# Count the missing values in each column
missing_counts = data.isna().sum()
print(missing_counts)

# Simplest strategy: discard every row that has at least one missing value
data = data.dropna(axis=0, how='any')

print("Missing values handled. Remaining rows:", len(data))


### Step 3: Summary statistics for numerical columns

In [None]:
# Descriptive statistics as computed by DataFrame.describe()
numeric_summary = data.describe()
print(numeric_summary)


### Step 4: Visualize the distribution of units sold using a histogram

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of units sold with a kernel density estimate overlaid;
# histplot returns the Axes it drew on, so label that object directly.
units_sold = data['Units Sold (million )']
hist_ax = sns.histplot(units_sold, kde=True, color='blue')
hist_ax.set_title('Distribution of Units Sold')
hist_ax.set_xlabel('Units Sold')
hist_ax.set_ylabel('Frequency')
plt.show()


### Step 5: Plot the trend of units sold over the years using a line chart

In [None]:
# Total units sold in each year, flattened back into a two-column frame
units_by_year = data.groupby('Year')['Units Sold (million )'].sum()
yearly_sales = units_by_year.reset_index()

# Line chart of the yearly totals, drawn on an explicit Axes
trend_fig, trend_ax = plt.subplots()
trend_ax.plot(
    yearly_sales['Year'],
    yearly_sales['Units Sold (million )'],
    marker='o',
)
trend_ax.set_title('Trend of Units Sold Over the Years')
trend_ax.set_xlabel('Year')
trend_ax.set_ylabel('Total Units Sold')
trend_ax.grid(True)
plt.show()


### 2. Statistical Analysis

#### Step 1: Perform a t-test for smartphones vs non-smartphones

In [6]:
from scipy.stats import ttest_ind

# Split the units-sold column into two samples using the 'Smartphone?' flag
units_col = 'Units Sold (million )'
smartphone_mask = data['Smartphone?'] == True
non_smartphone_mask = data['Smartphone?'] == False
smartphones = data.loc[smartphone_mask, units_col]
non_smartphones = data.loc[non_smartphone_mask, units_col]

# Independent two-sample t-test with SciPy's default settings
t_stat, p_value = ttest_ind(smartphones, non_smartphones)
print("T-test Results: T-stat =", t_stat, ", P-value =", p_value)


T-test Results: T-stat = -3.3881329319072755 , P-value = 0.0009570509480181121


### Step 2: Correlation matrix and heatmap

In [None]:
# Correlate the numeric columns only: select them first instead of calling
# .corr() on the full frame, which also holds non-numeric columns.
data_numeric = data.select_dtypes(include='number')

# Pairwise correlation matrix of the numeric columns
correlation = data_numeric.corr()

# Annotated heatmap; heatmap() returns the Axes holding the matrix
heatmap_ax = sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix')
plt.show()


### 3. Data Manipulation with Pandas.

### Step 1: Group by manufacturer and calculate total units sold.

In [None]:
# Sum units sold per manufacturer, then rank from highest to lowest
units_per_manufacturer = data.groupby('Manufacturer')['Units Sold (million )'].sum()
manufacturer_sales = units_per_manufacturer.sort_values(ascending=False)
print(manufacturer_sales)


### Step 2: Identify top 5 manufacturers

In [None]:
# manufacturer_sales is sorted in descending order when it is built, so its
# first five entries are the five best-selling manufacturers.
top_5_manufacturers = manufacturer_sales.head(5)

# Pass the newline as print()'s separator instead of embedding it in the
# label: with the default sep=" ", a space is emitted after the "\n" and the
# first row of the Series is shifted out of alignment with the rest.
print("Top 5 Manufacturers:", top_5_manufacturers, sep="\n")


### Step 3: Create a pivot table of total units sold per manufacturer per year

In [None]:
# Manufacturers as rows, years as columns; each cell holds the summed units
# sold, with 0 filled in where a manufacturer/year combination has no value.
pivot_table = pd.pivot_table(
    data,
    index='Manufacturer',
    columns='Year',
    values='Units Sold (million )',
    aggfunc='sum',
    fill_value=0,
)
print(pivot_table)


### 4. Data Visualization with Matplotlib and Seaborn.

#### Step 1: Bar chart for top 10 best-selling mobile models.

In [None]:
# Top 10 mobile models by total units sold (largest first)
top_10_models = (
    data.groupby('Model')['Units Sold (million )']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

# seaborn 0.13 deprecates passing `palette` without `hue`, so colour the bars
# by mapping hue to the same variable as the y axis. dodge=False keeps one
# full-width bar per model instead of reserving a slot for every hue level.
models_ax = sns.barplot(
    x=top_10_models.values,
    y=top_10_models.index,
    hue=top_10_models.index,
    dodge=False,
    palette='viridis',
)

# Depending on the seaborn version, hue adds a legend that only repeats the
# y tick labels; remove it when present. (The `legend=False` keyword is not
# used because it is not accepted by seaborn releases before 0.13.)
models_legend = models_ax.get_legend()
if models_legend is not None:
    models_legend.remove()

plt.title('Top 10 Best-Selling Mobile Models')
plt.xlabel('Units Sold')
plt.ylabel('Mobile Model')
plt.show()


### Step 2: Box plot for smartphones vs non-smartphones

In [None]:
# Compare the units-sold distribution of smartphones and non-smartphones.
# seaborn 0.13 deprecates passing `palette` without `hue`, so hue is mapped to
# the same column as x; dodge=False keeps each box centred on its category.
box_ax = sns.boxplot(
    x='Smartphone?',
    y='Units Sold (million )',
    hue='Smartphone?',
    data=data,
    dodge=False,
    palette='Set2',
)

# Depending on the seaborn version, hue adds a legend that only repeats the
# x tick labels; remove it when present. (The `legend=False` keyword is not
# used because it is not accepted by seaborn releases before 0.13.)
box_legend = box_ax.get_legend()
if box_legend is not None:
    box_legend.remove()

plt.title('Units Sold: Smartphones vs Non-Smartphones')
plt.xlabel('Smartphone?')
plt.ylabel('Units Sold')
plt.show()


### 5. Machine Learning

### Step 1: Build a simple linear regression model

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Single predictor (the 'Year' column, kept 2-D) and the units-sold target
X = data.loc[:, ['Year']]
y = data.loc[:, 'Units Sold (million )']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear model on the training split (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and score those predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)
print("Mean Squared Error:", mse)


R-squared: 0.16169979253184785
Mean Squared Error: 3460.2428594265875
