<a href="https://colab.research.google.com/github/JasjitWalia/ComboFinder/blob/main/Advanced_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#creating file with 1000 random strings
import random
import string

# One pass: every generated word is handed straight to writelines(), so the
# file ends up with 1000 lines of ten random lowercase letters each.
with open("random_strings.txt", "w") as file:
    file.writelines(
        "".join(random.choices(string.ascii_lowercase, k=10)) + "\n"
        for _ in range(1000)
    )


In [2]:
# Fill random_strings_large.txt with random 10-letter lines until the next
# line would push the character count past the 5 MB limit.
file_size_limit = 5 * 1024 * 1024  # 5 MB

with open("random_strings_large.txt", "w") as file:
    file_size = 0
    while file_size < file_size_limit:
        line = "".join(random.choices(string.ascii_lowercase, k=10)) + "\n"
        # Guard clause: stop before writing a line that would overshoot.
        if file_size + len(line) > file_size_limit:
            break
        file.write(line)
        file_size += len(line)


In [3]:
# Produce random_strings_0.txt ... random_strings_9.txt, each filled with
# random 10-letter lines up to (never beyond) the 5 MB character limit.
file_size_limit = 5 * 1024 * 1024  # 5 MB

for file_index in range(10):
    with open(f"random_strings_{file_index}.txt", "w") as file:
        file_size = 0
        while file_size < file_size_limit:
            line = "".join(random.choices(string.ascii_lowercase, k=10)) + "\n"
            # Guard clause: stop before writing a line that would overshoot.
            if file_size + len(line) > file_size_limit:
                break
            file.write(line)
            file_size += len(line)


In [None]:
# Create 5 files of sizes 1GB, 2GB, 3GB, 4GB, and 5GB, each containing
# multiple lines of random strings.
# NOTE: this writes roughly 15 GB in total (1 + 2 + 3 + 4 + 5), one
# 11-character line at a time, so expect it to run for a long while.
file_sizes = [1, 2, 3, 4, 5]  # in GB

# The positional index from enumerate() was never used, so iterate directly.
for size in file_sizes:
    file_size_limit = size * 1024 * 1024 * 1024  # Convert GB to bytes
    file_name = f"random_strings_{size}GB.txt"

    with open(file_name, "w") as file:
        file_size = 0
        while file_size < file_size_limit:
            line = "".join(random.choices(string.ascii_lowercase, k=10)) + "\n"
            line_size = len(line)

            # Stop before writing a line that would exceed the target size.
            if file_size + line_size > file_size_limit:
                break
            file.write(line)
            file_size += line_size


In [None]:
#convert all the files created in Q4 into uppercase
import os

file_sizes = [1, 2, 3, 4, 5]  # in GB

# Write an uppercase copy of each random_strings_<size>GB.txt next to it and
# delete the lowercase original afterwards.
for size in file_sizes:
    file_name = f"random_strings_{size}GB.txt"
    output_file = f"random_strings_{size}GB_uppercase.txt"

    # Stream line by line: read().upper() would hold the whole multi-GB file
    # and its uppercase copy in memory at the same time.
    with open(file_name, "r") as source, open(output_file, "w") as target:
        for line in source:
            target.write(line.upper())

    # Only reached when the copy was written without an exception.
    os.remove(file_name)


In [None]:
#Converting all the files from Q4 into uppercase in parallel using multi-threading
import concurrent.futures

file_sizes = [1, 2, 3, 4, 5]  # in GB


def convert_to_uppercase(file_name):
    """Write an uppercase copy of ``file_name`` and delete the original.

    The copy is named ``<stem>_uppercase<ext>`` (``.txt`` when the input has
    no extension), e.g. ``random_strings_1GB.txt`` becomes
    ``random_strings_1GB_uppercase.txt`` -- the same naming the sequential
    conversion uses.

    Args:
        file_name: Path of the text file to convert.

    Raises:
        OSError: If the input cannot be read, the output cannot be written,
            or the original cannot be removed.
    """
    stem, extension = os.path.splitext(file_name)
    output_file = f"{stem}_uppercase{extension or '.txt'}"

    # Stream line by line so a multi-GB file is never fully held in memory,
    # which matters even more when several conversions run at once.
    with open(file_name, "r") as source, open(output_file, "w") as target:
        for line in source:
            target.write(line.upper())

    # Only reached when the copy was written without an exception.
    os.remove(file_name)

# Convert every random_strings_<size>GB.txt on a worker thread.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [
        executor.submit(convert_to_uppercase, f"random_strings_{size}GB.txt")
        for size in file_sizes
    ]
    # A discarded future swallows whatever its worker raised (for example
    # FileNotFoundError when the input is already gone); result() re-raises
    # it here so a failed conversion is not mistaken for success.
    for future in concurrent.futures.as_completed(futures):
        future.result()


In [None]:
#To automatically download 10 images of cats from Google Images using a package from PyPI
from google_images_download import google_images_download

# Query settings for the cat download.
search_keywords = "cat"
num_images = 10

# Ask google_images_download for `num_images` JPEG hits for the keyword,
# passing "cat_images" as the library's output directory.
arguments = {
    "keywords": search_keywords,
    "limit": num_images,
    "format": "jpg",
    "output_directory": "cat_images",
}

response = google_images_download.googleimagesdownload()
response.download(arguments)


In [None]:
#To automatically download 10 videos of "Machine Learning" from YouTube using a package from PyPI
import pytube

search_keywords = "Machine Learning"
num_videos = 10

# pytube.YouTube wraps one video URL and offers no search method; keyword
# queries go through pytube.Search, whose `results` is a list of YouTube
# objects. Slice it so at most `num_videos` videos are downloaded.
results = pytube.Search(search_keywords).results[:num_videos]

# Save the highest-resolution stream pytube selects for each hit.
for video in results:
    video.streams.get_highest_resolution().download(output_path="machine_learning_videos")


In [None]:
#To convert all the videos downloaded in Q8 to audio using a package from PyPI
import moviepy.editor as mp

video_directory = "machine_learning_videos"
audio_directory = "machine_learning_audios"

# Nothing else creates the audio folder, and writing into a missing
# directory fails, so make sure it exists before the loop.
os.makedirs(audio_directory, exist_ok=True)

# Extract the audio track of every file in the video folder to an .mp3
# with the same base name.
for video_file in os.listdir(video_directory):
    video_path = os.path.join(video_directory, video_file)
    audio_file = os.path.splitext(video_file)[0] + ".mp3"
    audio_path = os.path.join(audio_directory, audio_file)

    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        audio.write_audiofile(audio_path)
    finally:
        # Release the clip's reader even when the export fails, so handles
        # do not pile up across the loop.
        video.close()


In [None]:
#Creating an automated pipeline using multi-threading for automatic downloading of 100 videos from YouTube and converting them to audio
import moviepy.editor as mp

search_keywords = "Machine Learning"
num_videos = 100


def download_video(video_url, output_directory):
    """Download one YouTube video into ``output_directory``.

    Args:
        video_url: Watch URL of the video to fetch.
        output_directory: Folder handed to the stream's ``download`` call.
    """
    # Pick the highest-resolution stream pytube offers and save it.
    best_stream = pytube.YouTube(video_url).streams.get_highest_resolution()
    best_stream.download(output_directory)

def convert_to_audio(video_path, audio_directory):
    """Extract the audio track of one video file to an .mp3.

    The output keeps the video's base name with an ``.mp3`` extension and is
    written into ``audio_directory``, which is created when missing.

    Args:
        video_path: Path of the video file to read.
        audio_directory: Folder that receives the .mp3 file.
    """
    # exist_ok makes this safe when several worker threads arrive at once.
    os.makedirs(audio_directory, exist_ok=True)
    audio_file = os.path.splitext(os.path.basename(video_path))[0] + ".mp3"
    audio_path = os.path.join(audio_directory, audio_file)

    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        audio.write_audiofile(audio_path)
    finally:
        # Release the clip's reader even when the export fails.
        video.close()

video_directory = "machine_learning_videos"
audio_directory = "machine_learning_audios"

# Both folders must exist before they are listed or written to.
os.makedirs(video_directory, exist_ok=True)
os.makedirs(audio_directory, exist_ok=True)

# pytube.YouTube wraps one video URL and offers no search method; keyword
# queries go through pytube.Search. One result page can be shorter than
# `num_videos`, so keep requesting further pages until there are enough hits
# or the search is exhausted (get_next_results raises IndexError then).
search = pytube.Search(search_keywords)
results = list(search.results)
while len(results) < num_videos:
    try:
        search.get_next_results()
    except IndexError:
        break
    if len(search.results) <= len(results):
        break  # the extra page added nothing; stop instead of looping
    results = list(search.results)
video_urls = [video.watch_url for video in results[:num_videos]]

# Download videos in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    downloads = {
        executor.submit(download_video, video_url, video_directory): video_url
        for video_url in video_urls
    }
    # Dropped futures hide every failure; report each one but keep going so
    # a single bad video does not abort the rest of the batch.
    for future in concurrent.futures.as_completed(downloads):
        try:
            future.result()
        except Exception as error:  # top-level boundary: report and continue
            print(f"Download failed for {downloads[future]}: {error}")

# Convert videos to audio in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    conversions = {}
    for video_file in os.listdir(video_directory):
        video_path = os.path.join(video_directory, video_file)
        future = executor.submit(convert_to_audio, video_path, audio_directory)
        conversions[future] = video_path
    for future in concurrent.futures.as_completed(conversions):
        try:
            future.result()
        except Exception as error:  # top-level boundary: report and continue
            print(f"Conversion failed for {conversions[future]}: {error}")


In [None]:
#Creating an automated pipeline using multi-threading for automatically downloading 500 images of dogs from Google Images and rescaling them to 50%
from PIL import Image

search_keywords = "dog"
num_images = 500
rescale_percentage = 50


def rescale_image(image_path, output_directory):
    """Save a copy of an image scaled by the global ``rescale_percentage``.

    The copy keeps the original file name and is written into
    ``output_directory``, which is created when missing.

    Args:
        image_path: Path of the image to read.
        output_directory: Folder that receives the rescaled copy.
    """
    # exist_ok makes this safe when several worker threads arrive at once.
    os.makedirs(output_directory, exist_ok=True)
    output_path = os.path.join(output_directory, os.path.basename(image_path))

    # The context manager closes the source file once the copy is saved.
    with Image.open(image_path) as image:
        # Clamp to 1 pixel so very small images never get a zero-sized side.
        new_size = (
            max(1, int(image.width * rescale_percentage / 100)),
            max(1, int(image.height * rescale_percentage / 100)),
        )
        image.resize(new_size).save(output_path)

image_directory = "dog_images_rescaled"

# Download images from Google Images
# NOTE(review): the google_images_download docs say limits above 100 need
# selenium plus chromedriver -- confirm the environment provides them.
response = google_images_download.googleimagesdownload()
arguments = {
    "keywords": search_keywords,
    "limit": num_images,
    "format": "jpg",
    "output_directory": "dog_images",
}
response.download(arguments)

# Nothing else creates the destination folder for the rescaled copies.
os.makedirs(image_directory, exist_ok=True)

# By default the downloader stores files in a per-keyword sub-folder of the
# output directory (see its `image_directory` argument), so a flat
# os.listdir("dog_images") would return that folder rather than the images.
# os.walk collects files from either layout.
image_paths = []
for folder, _subfolders, image_files in os.walk("dog_images"):
    for image_file in image_files:
        image_paths.append(os.path.join(folder, image_file))

# Rescale images in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    rescales = {
        executor.submit(rescale_image, image_path, image_directory): image_path
        for image_path in image_paths
    }
    # Dropped futures hide every failure; report each one but keep going so
    # one unreadable download does not abort the rest of the batch.
    for future in concurrent.futures.as_completed(rescales):
        try:
            future.result()
        except Exception as error:  # top-level boundary: report and continue
            print(f"Could not rescale {rescales[future]}: {error}")


In [None]:
#question 12
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Set random seed for reproducibility
np.random.seed(42)

# Create random dataset: 100 rows x 30 columns of integers in [1, 200].
# Cast to float up front: writing NaN into int64 columns relies on an
# implicit upcast that pandas deprecates (PDEP-6), whereas float columns
# accept NaN directly. The values themselves are unchanged.
dataset = pd.DataFrame(np.random.randint(1, 201, size=(100, 30))).astype(float)

# (i) Replace values with NA in the specified range and count rows with missing values
# NOTE(review): this blanks whole rows at positions 10..60; if the exercise
# meant cell *values* between 10 and 60, a value mask is needed -- confirm.
dataset.iloc[10:61] = np.nan
missing_rows_count = dataset.isna().any(axis=1).sum()
print(f"Number of rows with missing values: {missing_rows_count}")

# (ii) Replace NA values with column average (mean() skips the NaN cells)
dataset = dataset.fillna(dataset.mean())

# (iii) Calculate Pearson correlation and plot heatmap
correlation = dataset.corr()
sns.heatmap(correlation, annot=True, cmap="coolwarm")
plt.show()

# Select columns with correlation <= 0.7
# Every column correlates 1.0 with itself, so the diagonal has to be masked
# out; otherwise the per-column maximum is always 1.0 and nothing is selected.
off_diagonal = correlation.abs().mask(np.eye(len(correlation), dtype=bool))
selected_columns = correlation.columns[off_diagonal.max() <= 0.7]

# (iv) Normalize values between 0 and 10 (per-column min-max scaling)
column_min = dataset.min()
column_range = dataset.max() - column_min
normalized_dataset = (dataset - column_min) / column_range * 10

# (v) Replace values with 1 if <= 0.5, else with 0
# Threshold the normalized values: the raw data starts at 1, so comparing it
# against 0.5 could only ever yield zeros. The vectorised comparison also
# replaces the deprecated DataFrame.applymap.
binary_dataset = (normalized_dataset <= 0.5).astype(int)


In [None]:
# question 13
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage

# Set random seed for reproducibility
np.random.seed(42)

# Create random dataset: 500 rows x 10 columns drawn uniformly from [-10, 10)
dataset = pd.DataFrame(np.random.uniform(-10, 10, size=(500, 10)), columns=[f"Column_{i}" for i in range(1, 11)])

# Columns 1 to 8 are the features for both K-Means steps.
kmeans_features = dataset.iloc[:, :8]

# K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(kmeans_features)

# Determine optimal number of clusters using the Elbow method
# Each k gets its own throw-away model so the fitted 3-cluster `kmeans`
# above is not overwritten by the last (k=10) model of the sweep.
cluster_counts = range(1, 11)
inertia_values = [
    KMeans(n_clusters=k, random_state=42).fit(kmeans_features).inertia_
    for k in cluster_counts
]
plt.plot(cluster_counts, inertia_values)
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()

# Hierarchical clustering (Ward linkage on columns 2 to 5)
dendrogram_data = linkage(dataset.iloc[:, 1:5], method='ward')
plt.figure(figsize=(10, 6))
# Pass the labels as a plain list; NOTE(review): some SciPy versions appear
# to truth-test `labels`, which is ambiguous for a pandas Index -- confirm.
dendrogram(dendrogram_data, labels=dataset.index.tolist(), leaf_font_size=8)
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.title("Hierarchical Clustering Dendrogram")
plt.show()


In [None]:
#question 14
# Fix the seed so every run plots the same data.
np.random.seed(42)

# 600 rows x 15 columns drawn uniformly from [-100, 100); the columns carry
# the default integer labels 0..14.
random_values = np.random.uniform(-100, 100, size=(600, 15))
dataset = pd.DataFrame(random_values)

# (i) Column 5 against Column 6 (integer labels 4 and 5).
fifth_column = dataset[4]
sixth_column = dataset[5]
plt.scatter(fifth_column, sixth_column)
plt.xlabel("Column 5")
plt.ylabel("Column 6")
plt.title("Scatter Plot")
plt.show()

# (ii) One histogram panel per column, all inside a single figure.
dataset.hist(figsize=(10, 6))
plt.tight_layout()
plt.show()

# (iii) One box per column on shared axes.
plt.figure(figsize=(10, 6))
sns.boxplot(data=dataset)
plt.xticks(rotation=90)
plt.title("Box Plot")
plt.show()


In [None]:
#question 15
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp, wilcoxon, ttest_ind, ranksums

# Fix the seed so the sample (and therefore every test result) is repeatable.
np.random.seed(42)

# 500 rows x 5 columns drawn uniformly from [5, 10), named Column_1..Column_5.
column_names = [f"Column_{i}" for i in range(1, 6)]
dataset = pd.DataFrame(np.random.uniform(5, 10, size=(500, 5)), columns=column_names)

# (i) + (ii) For every column, run a one-sample t-test against 5 and a
# Wilcoxon signed-rank test on the differences from 5.
t_test_results = {}
wilcoxon_results = {}
for column, values in dataset.items():
    t_statistic, t_p_value = ttest_1samp(values, 5)
    t_test_results[column] = {"t-statistic": t_statistic, "p-value": t_p_value}

    statistic, p_value = wilcoxon(values - 5)
    wilcoxon_results[column] = {"Statistic": statistic, "p-value": p_value}

# (iii) Two-sample t-test and Wilcoxon rank-sum test, Column 3 vs Column 4.
column_3, column_4 = dataset["Column_3"], dataset["Column_4"]
t_test_2samp_result = ttest_ind(column_3, column_4)
wilcoxon_ranksums_result = ranksums(column_3, column_4)
