<a href="https://colab.research.google.com/github/ManthanVerma7/TIET-SS-MiniProject-06-AdvancePython/blob/main/TIET_SS_MiniProject_06_AdvancePython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Part I: Process Automation**

In [None]:
# Q1: Create a file that contains 1000 lines of random strings.

import random
import string

# Build the 1000 lines lazily (20 random alphanumeric characters each,
# newline-terminated) and hand them to the file in one writelines() call.
alphabet = string.ascii_letters + string.digits
random_lines = (
    ''.join(random.choices(alphabet, k=20)) + '\n'
    for _ in range(1000)
)

with open("q1_random_1000_lines.txt", "w") as f:
    f.writelines(random_lines)

print("Created 'q1_random_1000_lines.txt'")



In [None]:
# Q2: Create a file that contains multiple lines of random strings and file size must be 5 MB.

# Write newline-terminated lines of random alphanumeric characters until the
# file is exactly 5 MB (5 * 1024 * 1024 bytes).
#
# The size is tracked with a counter instead of f.tell(): for text-mode files
# the Python docs define tell() as an opaque number rather than a byte count,
# and this also avoids querying the file position after every line.
# encoding="ascii" with newline="\n" makes one character equal one byte on
# every platform, so the counter matches the size on disk.
target_bytes = 5 * 1024 * 1024  # 5 MB
alphabet = string.ascii_letters + string.digits

written = 0
with open("q2_random_5mb.txt", "w", encoding="ascii", newline="\n") as f:
    while written < target_bytes:
        # Normal lines carry 100 characters; the last line is shortened so
        # that line + newline ends exactly on the target size.
        payload_len = min(100, target_bytes - written - 1)
        line = ''.join(random.choices(alphabet, k=payload_len))
        f.write(line + '\n')
        written += payload_len + 1


In [None]:
# Q3: Create 10 files that contain multiple lines of random strings and each file must be 5 MB.

# Create q3_file_1.txt .. q3_file_10.txt, each exactly 5 MB of
# newline-terminated random alphanumeric lines.
#
# Bytes are counted explicitly instead of calling f.tell(): in text mode the
# Python docs define tell() as an opaque number, not a byte count.
# encoding="ascii" with newline="\n" keeps one character == one byte on every
# platform so the counter matches the size on disk.
target_bytes = 5 * 1024 * 1024  # 5 MB per file
alphabet = string.ascii_letters + string.digits

for i in range(10):
    written = 0
    with open(f"q3_file_{i+1}.txt", "w", encoding="ascii", newline="\n") as f:
        while written < target_bytes:
            # 100 characters per line; the final line is shortened so the
            # file ends exactly on the target size.
            payload_len = min(100, target_bytes - written - 1)
            line = ''.join(random.choices(alphabet, k=payload_len))
            f.write(line + '\n')
            written += payload_len + 1


In [None]:
# Q4: Create 5 files of size 1GB to 5GB containing multiple lines of random strings.

# File sizes in GB; later cells (Q5, Q6) iterate over this list as well.
sizes_gb = [1, 2, 3, 4, 5]
alphabet = string.ascii_letters + string.digits

# Create q4_1GB.txt .. q4_5GB.txt filled with newline-terminated random lines.
#
# The number of bytes written is tracked with a counter rather than f.tell():
# in text mode the Python docs define tell() as an opaque number, not a byte
# count, and polling it after every ~100-byte line is needless work at this
# scale. encoding="ascii" with newline="\n" keeps one character == one byte.
for size in sizes_gb:
    target_bytes = size * 1024 * 1024 * 1024
    written = 0
    with open(f"q4_{size}GB.txt", "w", encoding="ascii", newline="\n") as f:
        while written < target_bytes:
            # 100 characters per line; the final line is shortened so the
            # file ends exactly on the requested size.
            payload_len = min(100, target_bytes - written - 1)
            line = ''.join(random.choices(alphabet, k=payload_len))
            f.write(line + '\n')
            written += payload_len + 1


In [None]:
# Q5: Convert all the files of Q4 into upper case one by one.

import os

# Upper-case every Q4 file, one after another, into q5_upper_<size>GB.txt.
#
# The input is streamed in fixed-size chunks instead of f.read(): the Q4 files
# are 1-5 GB each, and reading one whole (plus holding its upper-cased copy)
# needs several GB of RAM. str.upper() works per character, so converting
# chunk by chunk yields the same output as converting the whole text at once.
chunk_chars = 1024 * 1024  # characters read per iteration

for size in sizes_gb:
    filename = f"q4_{size}GB.txt"
    with open(filename, "r") as src, open(f"q5_upper_{size}GB.txt", "w") as dst:
        # iter(callable, sentinel) keeps reading until read() returns "" (EOF).
        for chunk in iter(lambda: src.read(chunk_chars), ""):
            dst.write(chunk.upper())


In [None]:
# Q6: Convert all the files of Q4 into upper case using multi-threading.

import threading

def convert_upper_thread(file_size):
    """Write an upper-cased copy of one Q4 file.

    Reads ``q4_{file_size}GB.txt`` and writes the upper-cased text to
    ``q6_thread_upper_{file_size}GB.txt``.

    The file is streamed in 1 Mi-character chunks rather than read in one
    call: this function runs in several threads at once, and loading every
    multi-GB input (plus its upper-cased copy) simultaneously would need far
    more memory than the data warrants. ``str.upper()`` works per character,
    so chunked conversion produces the same output.

    Args:
        file_size: Size label used to build the input and output file names.
    """
    input_file = f"q4_{file_size}GB.txt"
    output_file = f"q6_thread_upper_{file_size}GB.txt"
    with open(input_file, "r") as src, open(output_file, "w") as dst:
        # iter(callable, sentinel) keeps reading until read() returns "" (EOF).
        for chunk in iter(lambda: src.read(1024 * 1024), ""):
            dst.write(chunk.upper())

# One worker thread per Q4 file: create them, start them all, then block
# until every conversion has finished.
threads = [
    threading.Thread(target=convert_upper_thread, args=(size,))
    for size in sizes_gb
]

for worker in threads:
    worker.start()

for worker in threads:
    worker.join()


In [None]:
# Q7: Automatically download 10 images of cat from Google Images.

from icrawler.builtin import GoogleImageCrawler

# Crawl Google Images for the keyword 'cat', asking for at most 10 results,
# and store them under the q7_cat_images directory.
storage_config = {'root_dir': 'q7_cat_images'}
google_crawler = GoogleImageCrawler(storage=storage_config)
google_crawler.crawl(
    keyword='cat',
    max_num=10,
)

print("✅ Q7 complete: 10 cat images downloaded to 'q7_cat_images/'")


In [None]:
# Q8: Automatically download 10 videos of "Machine Learning" from YouTube.

from pytube import Search, YouTube
import os

# Search YouTube for "Machine Learning" and try to download the first 10 hits
# as progressive MP4 files into q8_ml_videos/.
search = Search("Machine Learning")
results = search.results[:10]

os.makedirs("q8_ml_videos", exist_ok=True)

downloaded = 0
for i, video in enumerate(results):
    try:
        stream = video.streams.filter(progressive=True, file_extension='mp4').first()
        # .first() returns None when no stream matches the filter.
        if stream is None:
            print(f"Skipping video {i+1}: no progressive mp4 stream available")
            continue
        stream.download(output_path="q8_ml_videos", filename=f"ml_video_{i+1}.mp4")
        downloaded += 1
    except Exception as e:
        # Per-video boundary: report the failure and carry on with the
        # remaining videos instead of aborting the whole batch.
        print(f"Error downloading video {i+1}: {e}")

# Report what was actually fetched rather than assuming all 10 succeeded.
print(f"✅ Q8 complete: {downloaded} Machine Learning videos downloaded.")


In [None]:
# Q9: Convert all the videos from Q8 to audio (MP3 format).

from moviepy.editor import VideoFileClip
import os

# Extract the audio track of every .mp4 in q8_ml_videos/ into an .mp3 with
# the same base name under q9_audio_output/.
video_folder = "q8_ml_videos"
audio_folder = "q9_audio_output"
os.makedirs(audio_folder, exist_ok=True)

for video_file in os.listdir(video_folder):
    if not video_file.endswith(".mp4"):
        continue

    video_path = os.path.join(video_folder, video_file)
    # splitext swaps only the trailing extension; str.replace would also
    # rewrite any ".mp4" that appears earlier in the file name.
    audio_name = os.path.splitext(video_file)[0] + ".mp3"
    audio_path = os.path.join(audio_folder, audio_name)

    video_clip = VideoFileClip(video_path)
    try:
        # A clip without an audio track has audio set to None.
        if video_clip.audio is None:
            print(f"Skipping {video_file}: no audio track")
            continue
        video_clip.audio.write_audiofile(audio_path)
    finally:
        # Always release the clip's readers, even if the conversion failed.
        video_clip.close()

print("✅ Q9 complete: Audio files saved in 'q9_audio_output/'")


In [None]:
# Q10: Automated pipeline using multi-threading to download 100 YouTube videos and convert to audio.

from concurrent.futures import ThreadPoolExecutor
from pytube import Search
from moviepy.editor import VideoFileClip

# Collect up to 100 search hits. Search.results initially holds only the first
# page; get_next_results() appends the next page to it, so keep paging until
# there are enough hits or the search is exhausted.
search = Search("Machine Learning")
while len(search.results) < 100:
    count_before = len(search.results)
    try:
        search.get_next_results()
    except IndexError:
        # Raised by pytube when there is no continuation token left.
        break
    if len(search.results) == count_before:
        break  # a page that adds nothing: stop instead of looping forever
search_results = search.results[:100]

os.makedirs("q10_pipeline_videos", exist_ok=True)
os.makedirs("q10_pipeline_audio", exist_ok=True)

def download_and_convert(video_obj, index):
    """Download one video as MP4 and extract its audio track to MP3.

    Args:
        video_obj: Search result object exposing ``.streams`` (one item of
            ``Search(...).results``).
        index: Number used in the output names ``video_{index}.mp4`` and
            ``audio_{index}.mp3``.

    Returns:
        True if both the download and the conversion succeeded, False
        otherwise. Failures are printed, never raised, so one bad video
        cannot stop the other workers.
    """
    try:
        video = video_obj.streams.filter(progressive=True, file_extension='mp4').first()
        # .first() returns None when no stream matches the filter.
        if video is None:
            print(f"Error processing video {index}: no progressive mp4 stream available")
            return False
        video_path = video.download(output_path="q10_pipeline_videos", filename=f"video_{index}.mp4")

        clip = VideoFileClip(video_path)
        try:
            if clip.audio is None:
                print(f"Error processing video {index}: no audio track")
                return False
            clip.audio.write_audiofile(f"q10_pipeline_audio/audio_{index}.mp3")
        finally:
            # Always release the clip's readers, even if the conversion failed.
            clip.close()
        return True
    except Exception as e:
        print(f"Error processing video {index}: {e}")
        return False

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [
        executor.submit(download_and_convert, video_obj, i)
        for i, video_obj in enumerate(search_results)
    ]

# The executor has shut down here, so every future is already finished.
converted = sum(1 for future in futures if future.result())

# Report real numbers instead of assuming all 100 videos made it through.
print(f"✅ Q10 complete: Downloaded and converted {converted} of {len(search_results)} videos to audio.")


In [None]:
# Q11: Download 500 Dog images → Rescale them to 50%.

from PIL import Image
from icrawler.builtin import GoogleImageCrawler

download_dir = 'q11_dog_images'
resized_dir = 'q11_resized_dogs'

os.makedirs(download_dir, exist_ok=True)
os.makedirs(resized_dir, exist_ok=True)

# Step 1: Download (asks the crawler for at most 500 'dog' images)
google_crawler = GoogleImageCrawler(storage={'root_dir': download_dir})
google_crawler.crawl(keyword='dog', max_num=500)

# Step 2: Rescale every downloaded PNG/JPEG to half its width and height
image_files = [
    name for name in os.listdir(download_dir)
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
]

resized_count = 0
for img_file in image_files:
    img_path = os.path.join(download_dir, img_file)
    try:
        # The context manager closes the image file once it has been processed.
        with Image.open(img_path) as img:
            # max(1, ...) keeps 1-pixel-wide/high images from collapsing to 0.
            new_size = (max(1, img.size[0] // 2), max(1, img.size[1] // 2))
            resized = img.resize(new_size)
            resized.save(os.path.join(resized_dir, img_file))
    except Exception as e:
        # Still best-effort (a broken image must not stop the batch), but the
        # failure is reported, and Ctrl-C is no longer swallowed as it was
        # with the bare `except:`.
        print(f"Skipping {img_file}: {e}")
        continue
    resized_count += 1

# Report real numbers instead of assuming 500 images were downloaded and resized.
print(f"✅ Q11 complete: {resized_count} of {len(image_files)} downloaded dog images resized to 50%.")


**Part II: Data Analytics**

In [None]:
# Q12: Create a random dataset of 100 rows and 30 columns with values between [1,200]
# (i) Replace values between 10 and 60 with NA and count rows with missing values
# (ii) Replace NA with column average
# (iii) Pearson correlation + heatmap, list columns with correlation < -0.7
# (iv) Normalize all values between 0 and 10
# (v) Replace values <= 0.5 with 1, else with 0

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Dataset generation: 100 x 30 integers drawn from [1, 200]
# (randint's upper bound is exclusive, hence 201)
df = pd.DataFrame(np.random.randint(1, 201, size=(100, 30)))

# (i) Replace values in [10,60] with NaN
# DataFrame has no .between() method (only Series does), so the inclusive
# range mask is built from two element-wise comparisons.
in_range = (df >= 10) & (df <= 60)
df_replaced = df.mask(in_range)
missing_rows = df_replaced.isnull().any(axis=1).sum()
print(f"(i) Rows with missing values: {missing_rows}")

# (ii) Replace NaN with column mean (mean() skips NaN by default)
df_filled = df_replaced.fillna(df_replaced.mean())

# (iii) Pearson correlation between every pair of columns, drawn as a heatmap
correlation_matrix = df_filled.corr(method='pearson')
sns.heatmap(correlation_matrix, cmap='coolwarm')
plt.title("Pearson Correlation Heatmap")
plt.show()

# For each column, count how many coefficients fall below -0.7, then keep the
# columns that have at least one such strongly negative partner.
strongly_negative = correlation_matrix.lt(-0.7)
low_corr = strongly_negative.sum()
low_corr_columns = low_corr.loc[low_corr.gt(0)].index.tolist()
print(f"(iii) Columns with correlation < -0.7: {low_corr_columns}")

# (iv) Normalize between 0 and 10 (column-wise min-max scaling)
col_min = df_filled.min()
col_range = df_filled.max() - col_min
df_normalized = 10 * (df_filled - col_min) / col_range

# (v) Replace with 1 if value <= 0.5 else 0
# Vectorized comparison instead of DataFrame.applymap, which is deprecated in
# recent pandas and calls a Python lambda once per cell. NaN compares as
# False, so it maps to 0 exactly as the lambda did.
df_binary = (df_normalized <= 0.5).astype(int)


In [None]:
# Q13: Create 500×10 dataset
# Columns 1-4: [-10, 10], Columns 5-8: [10, 20], Columns 9-10: [-100, 100]
# Apply: K-Means and Hierarchical clustering with graphs

from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler

# Dataset creation: 500 rows x 10 columns, three value ranges side by side
#   columns 0-3: uniform in [-10, 10)
#   columns 4-7: uniform in [10, 20)
#   columns 8-9: uniform in [-100, 100)
# ignore_index=True relabels the concatenated columns 0..9; without it the
# three frames keep their own labels (0-3, 0-3, 0-1) and the result has
# duplicate column names, so data[0] would select several columns at once.
data = pd.concat([
    pd.DataFrame(np.random.uniform(-10, 10, size=(500, 4))),
    pd.DataFrame(np.random.uniform(10, 20, size=(500, 4))),
    pd.DataFrame(np.random.uniform(-100, 100, size=(500, 2)))
], axis=1, ignore_index=True)

# Standardize the features with StandardScaler before clustering
scaler = StandardScaler().fit(data)
data_scaled = scaler.transform(data)

# (i) K-Means clustering - Elbow method
# Fit one model per cluster count (1..10) and record its inertia.
cluster_counts = range(1, 11)
inertia = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, n_init=10).fit(data_scaled)
    inertia.append(kmeans.inertia_)

plt.plot(cluster_counts, inertia, marker='o')
plt.title("Elbow Method for K-Means")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()

# (ii) Hierarchical clustering - Dendrogram
# Ward linkage on the scaled data; the plot is truncated to the last 10
# merged clusters.
linked = linkage(data_scaled, method='ward')
plt.figure(figsize=(10, 5))
dendrogram(
    linked,
    truncate_mode='lastp',
    p=10,
)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()


In [None]:
# Q13: Create 500×10 dataset
# Columns 1-4: [-10, 10], Columns 5-8: [10, 20], Columns 9-10: [-100, 100]
# Apply: K-Means and Hierarchical clustering with graphs

from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler

# Dataset creation: 500 rows x 10 columns, three value ranges side by side
#   columns 0-3: uniform in [-10, 10)
#   columns 4-7: uniform in [10, 20)
#   columns 8-9: uniform in [-100, 100)
# ignore_index=True relabels the concatenated columns 0..9; without it the
# three frames keep their own labels (0-3, 0-3, 0-1) and the result has
# duplicate column names, so data[0] would select several columns at once.
data = pd.concat([
    pd.DataFrame(np.random.uniform(-10, 10, size=(500, 4))),
    pd.DataFrame(np.random.uniform(10, 20, size=(500, 4))),
    pd.DataFrame(np.random.uniform(-100, 100, size=(500, 2)))
], axis=1, ignore_index=True)

# Standardize the features with StandardScaler before clustering
scaler = StandardScaler().fit(data)
data_scaled = scaler.transform(data)

# (i) K-Means clustering - Elbow method
# Fit one model per cluster count (1..10) and record its inertia.
cluster_counts = range(1, 11)
inertia = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, n_init=10).fit(data_scaled)
    inertia.append(kmeans.inertia_)

plt.plot(cluster_counts, inertia, marker='o')
plt.title("Elbow Method for K-Means")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()

# (ii) Hierarchical clustering - Dendrogram
# Ward linkage on the scaled data; the plot is truncated to the last 10
# merged clusters.
linked = linkage(data_scaled, method='ward')
plt.figure(figsize=(10, 5))
dendrogram(
    linked,
    truncate_mode='lastp',
    p=10,
)
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.show()


In [None]:
# Q14: Create dataset of size 600×15 with values in [-100, 100]
# Plot: (i) Scatter of Col 5 vs Col 6, (ii) Histogram of all, (iii) Box plot of all

# 600 x 15 dataset of uniform random values between -100 and 100
random_values = np.random.uniform(-100, 100, size=(600, 15))
df = pd.DataFrame(random_values)

# (i) Scatter plot of Column 5 vs Column 6 (labels 4 and 5, zero-based)
fifth_column, sixth_column = df[4], df[5]
plt.scatter(fifth_column, sixth_column, alpha=0.6)
plt.xlabel("Column 5")
plt.ylabel("Column 6")
plt.title("Scatter Plot: Column 5 vs Column 6")
plt.grid(True)
plt.show()

# (ii) Histograms: one subplot per column
df.hist(
    figsize=(14, 10),
    bins=20,
    edgecolor='black',
)
plt.suptitle("Histogram of All Columns", fontsize=16)
plt.show()

# (iii) Box Plot: horizontal boxes, one per column
df.plot.box(figsize=(14, 6), vert=False)
plt.title("Box Plot of All Columns")
plt.grid(True)
plt.show()


In [None]:
# Q15: Create dataset (500×5) with values [5,10]
# (i) T-test, (ii) Wilcoxon Signed Rank, (iii) Two Sample T-Test & Wilcoxon Rank-Sum

from scipy.stats import ttest_1samp, wilcoxon, ttest_ind, ranksums

# 500 x 5 dataset of uniform random values between 5 and 10
df = pd.DataFrame(np.random.uniform(5, 10, size=(500, 5)))

# Columns are reported 1-based ("Column 1".."Column 5") so these lines agree
# with the "Column 3 vs Column 4" comparison below, which uses df[2] and df[3].

# (i) T-Test against mean = 7.5
for col in df.columns:
    t_stat, p_val = ttest_1samp(df[col], 7.5)
    print(f"Column {col + 1}: T-Test p-value = {p_val:.4f}")

# (ii) Wilcoxon Signed-Rank Test vs 7.5 (run on the differences from 7.5)
for col in df.columns:
    try:
        w_stat, p_val = wilcoxon(df[col] - 7.5)
    except ValueError as exc:
        # Catch only ValueError, which SciPy raises for unusable input, and
        # show its actual reason instead of guessing a cause.
        print(f"Column {col + 1}: Wilcoxon test not applicable ({exc})")
    else:
        print(f"Column {col + 1}: Wilcoxon p-value = {p_val:.4f}")

# (iii) Two-Sample T-Test & Wilcoxon between Column 3 and 4
print("\nColumn 3 vs Column 4:")
print("T-Test:", ttest_ind(df[2], df[3]))
print("Wilcoxon Rank-Sum:", ranksums(df[2], df[3]))
