In [None]:
# 1. TF-IDF Analysis Task:
# Create a mini search engine for a set of documents.
# -Task:
#   -  you should come up with a corpus of documents (these could be articles, book chapters, or any collection of text).
#   -  you need to write a program that calculates the TF-IDF score for each word in each document.
#  -  you should then create a simple search function that uses the TF-IDF scores to find and rank documents based on a query of one or more words.
# -Extension:
#   - You could expand the search engine to include basic preprocessing of the text such as tokenization, stop-word removal, stemming, or lemmatization.
#   - Discuss how different preprocessing steps might affect the TF-IDF scores and the search results.


In [None]:
import nltk
import math
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the tokenizer models and the stop-word list used by the cells below.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# (3.9 and later) load the Punkt tokenizer from that resource; with only the
# older 'punkt' package installed, word_tokenize raises LookupError there.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Stored as a set so the membership test in the token filter is a hash lookup.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Corpus for the mini search engine: ten one-sentence, car-themed documents.
# A document's position in this list is the index used when ranking results.
car_documents = [
    "The automotive industry is undergoing rapid advancements in electric vehicles.",
    "Many car manufacturers are investing heavily in autonomous driving technology.",
    "Fuel efficiency has become a key focus for consumers when purchasing new vehicles.",
    "Classic cars hold a special place in the hearts of many enthusiasts.",
    "Safety features such as automatic emergency braking and adaptive cruise control are becoming standard in modern cars.",
    "The future of mobility is expected to be shaped by innovations in connected vehicles and smart transportation systems.",
    "Vintage car shows attract car lovers from around the world to admire rare and well-preserved automobiles.",
    "Car racing remains a popular sport, with events like Formula 1 drawing large crowds and global television audiences.",
    "Electric cars are becoming increasingly mainstream as governments push for greener transportation options.",
    "Car enthusiasts often gather at car meets and rallies to showcase their vehicles and share their passion for cars.",
]

In [None]:
def preprocess_documents(documents):
    """Lowercase, tokenize and filter each document into a cleaned string.

    Each document is lowercased and split with NLTK's ``word_tokenize``.
    A token is kept only when it is purely alphanumeric and is not in the
    module-level ``stop_words`` set, so punctuation and tokens containing
    non-alphanumeric characters (for example hyphenated words) are dropped.

    Args:
        documents: Iterable of raw document strings.

    Returns:
        A list of strings, one per input document and in the same order,
        each holding the surviving tokens joined by single spaces.
    """
    def _clean(text):
        # Generator keeps the filter lazy; join consumes it once.
        kept = (
            tok
            for tok in word_tokenize(text.lower())
            if tok.isalnum() and tok not in stop_words
        )
        return " ".join(kept)

    return [_clean(text) for text in documents]

In [None]:
# Run the preprocessing step over the raw corpus; the bare expression at the
# end of the cell displays the cleaned documents for inspection.
preprocessed_car_documents = preprocess_documents(car_documents)

preprocessed_car_documents

['automotive industry undergoing rapid advancements electric vehicles',
 'many car manufacturers investing heavily autonomous driving technology',
 'fuel efficiency become key focus consumers purchasing new vehicles',
 'classic cars hold special place hearts many enthusiasts',
 'safety features automatic emergency braking adaptive cruise control becoming standard modern cars',
 'future mobility expected shaped innovations connected vehicles smart transportation systems',
 'vintage car shows attract car lovers around world admire rare automobiles',
 'car racing remains popular sport events like formula 1 drawing large crowds global television audiences',
 'electric cars becoming increasingly mainstream governments push greener transportation options',
 'car enthusiasts often gather car meets rallies showcase vehicles share passion cars']

In [None]:
# Calculate TF-IDF scores using TfidfVectorizer
def calculate_tf_idf_scores(preprocessed_documents):
    """Compute per-document TF-IDF weights with scikit-learn.

    A ``TfidfVectorizer`` with default settings is fitted on the given
    documents and the resulting sparse matrix is unpacked row by row.

    Args:
        preprocessed_documents: Iterable of document strings (here, the
            space-joined tokens produced by the preprocessing step).

    Returns:
        A list with one dict per document, in input order. Each dict maps a
        term that has a stored entry in that document's matrix row to its
        TF-IDF score.
    """
    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(preprocessed_documents)
    vocabulary = tfidf.get_feature_names_out()

    # Every sparse row exposes the column indices and values of its stored
    # entries; pairing them gives (term index, score) for that document.
    return [
        {vocabulary[col]: value for col, value in zip(doc_row.indices, doc_row.data)}
        for doc_row in weights
    ]

In [None]:
# Score the cleaned corpus: one {term: TF-IDF weight} dict per document,
# in the same order as car_documents. The bare expression displays them.
tf_idf_scores_car = calculate_tf_idf_scores(preprocessed_car_documents)

tf_idf_scores_car

[{'vehicles': 0.26641929917719137,
  'electric': 0.3425149376316455,
  'advancements': 0.4029154438651177,
  'rapid': 0.4029154438651177,
  'undergoing': 0.4029154438651177,
  'industry': 0.4029154438651177,
  'automotive': 0.4029154438651177},
 {'technology': 0.37372070791781686,
  'driving': 0.37372070791781686,
  'autonomous': 0.37372070791781686,
  'heavily': 0.37372070791781686,
  'investing': 0.37372070791781686,
  'manufacturers': 0.37372070791781686,
  'car': 0.24711489869025727,
  'many': 0.3176967448459908},
 {'new': 0.34427082653791113,
  'purchasing': 0.34427082653791113,
  'consumers': 0.34427082653791113,
  'focus': 0.34427082653791113,
  'key': 0.34427082653791113,
  'become': 0.34427082653791113,
  'efficiency': 0.34427082653791113,
  'fuel': 0.34427082653791113,
  'vehicles': 0.22764178869273513},
 {'enthusiasts': 0.3240346206695762,
  'hearts': 0.38117623107916926,
  'place': 0.38117623107916926,
  'special': 0.38117623107916926,
  'hold': 0.38117623107916926,
  'cars

In [None]:
# Define search function based on TF-IDF scores
def search_car(query, documents, tf_idf_scores):
    """Rank documents by the summed TF-IDF weight of the query's terms.

    The query is lowercased and tokenized with ``word_tokenize``; only
    purely alphanumeric tokens are used. A document's score is the sum of
    its TF-IDF weights for those tokens, with 0 for any token it lacks.

    Args:
        query: Free-text query string of one or more words.
        documents: Sequence of original documents, indexable by position.
        tf_idf_scores: Iterable of ``{term: score}`` dicts aligned with
            ``documents``.

    Returns:
        A list containing every document, ordered from highest to lowest
        score. Documents with equal scores (including those scoring 0)
        keep their original relative order.
    """
    terms = [tok for tok in word_tokenize(query.lower()) if tok.isalnum()]

    # One total per document, in corpus order.
    totals = [
        sum(weights.get(term, 0) for term in terms)
        for weights in tf_idf_scores
    ]

    # sorted() is stable even with reverse=True, so ties stay in corpus order.
    order = sorted(range(len(totals)), key=totals.__getitem__, reverse=True)
    return [documents[position] for position in order]

In [None]:
# Prompt the user to input their query (blocks until text is entered)
query = input("Enter your query: ")

# Call the search function with the user's query; the result is the full
# corpus re-ordered by relevance, so non-matching documents appear too.
results = search_car(query, car_documents, tf_idf_scores_car)

# Print the search results as a 1-based numbered list
print("Search Results for query '{}':".format(query))
for idx, result in enumerate(results):
    print("{}. {}".format(idx+1, result))

Enter your query: automatic
Search Results for query 'automatic':
1. Safety features such as automatic emergency braking and adaptive cruise control are becoming standard in modern cars.
2. The automotive industry is undergoing rapid advancements in electric vehicles.
3. Many car manufacturers are investing heavily in autonomous driving technology.
4. Fuel efficiency has become a key focus for consumers when purchasing new vehicles.
5. Classic cars hold a special place in the hearts of many enthusiasts.
6. The future of mobility is expected to be shaped by innovations in connected vehicles and smart transportation systems.
7. Vintage car shows attract car lovers from around the world to admire rare and well-preserved automobiles.
8. Car racing remains a popular sport, with events like Formula 1 drawing large crowds and global television audiences.
9. Electric cars are becoming increasingly mainstream as governments push for greener transportation options.
10. Car enthusiasts often gath

### Task 2

In [None]:
# N-gram Model Task:
#   - Come up with a text dataset (like a set of tweets, sentences from books, or movie subtitles), or reuse the data from the previous task.
#   - you will create an N-gram model that predicts the next word(s) based on the previous word(s) in a sequence.
#   - You should then use your model to generate new sentences based on a given starting sequence of words.
# - Extension:
#     - Explore the effects of smoothing techniques on the model's performance.

In [None]:
from nltk import trigrams
import random

# Define the corpus of car-related sentences used to train the trigram model
car_sentences = [
    "Vehicle automotive industry is undergoing rapid advancements in electric vehicles.",
    "Many car manufacturers are investing heavily in autonomous driving technology.",
    "Fuel efficiency has become a key focus for consumers when purchasing new vehicles.",
    "Classic cars hold a special place in the hearts of many enthusiasts.",
    "Safety features such as automatic emergency braking and adaptive cruise control are becoming standard in modern cars.",
]

# Preprocess the corpus: join all sentences into one lowercase stream and
# split on whitespace. Punctuation stays attached to its word (e.g.
# "vehicles."), and trigrams can span sentence boundaries.
corpus = " ".join(car_sentences).lower().split()

# Create trigrams: every run of three consecutive tokens, as a list of tuples
tri_grams = list(trigrams(corpus))

# Function to predict the next word based on the trigrams
def predict_next_word_tri_gram(tri_grams, prefix):
    """Pick a word that followed ``prefix`` somewhere in the trigram list.

    Args:
        tri_grams: Iterable of 3-tuples ``(first, second, third)``.
        prefix: 2-tuple ``(first, second)`` to look up. It is compared with
            a tuple, so it must itself be a tuple to match.

    Returns:
        One third word chosen uniformly at random from all trigrams whose
        first two words equal ``prefix`` (repeated trigrams are therefore
        proportionally more likely), or ``None`` after printing a message
        when no trigram matches.
    """
    candidates = []
    for head, middle, tail in tri_grams:
        if (head, middle) == prefix:
            candidates.append(tail)

    if candidates:
        return random.choice(candidates)

    print("Prefix not found in the corpus.")
    return None

# Function to generate a new sentence based on a given starting sequence of words
def generate_sentence_tri_gram(tri_grams, start_sequence, max_length=20):
    """Extend a starting word sequence one word at a time with the trigram model.

    The last two words of the sentence so far are used as the lookup prefix
    for ``predict_next_word_tri_gram``. Generation stops when the sentence
    reaches ``max_length`` words, when no continuation is found, or when the
    appended word ends with '.', '!' or '?'.

    Args:
        tri_grams: Iterable of 3-tuples of words.
        start_sequence: Iterable of starting words; at least two are needed
            for any word to be generated.
        max_length: Maximum number of words in the returned sentence
            (a longer start sequence is returned unchanged).

    Returns:
        The words of the sentence joined by single spaces. If fewer than two
        starting words are given, a message is printed and the starting
        words are returned as-is.
    """
    sentence = list(start_sequence)

    # A trigram prefix is two words; without this guard sentence[-2] below
    # would raise IndexError for an empty or single-word start sequence.
    if len(sentence) < 2:
        print("At least two starting words are required to generate a sentence.")
        return ' '.join(sentence)

    while len(sentence) < max_length:
        next_word = predict_next_word_tri_gram(tri_grams, (sentence[-2], sentence[-1]))
        if next_word is None:
            print("No words found to continue the sentence.")
            break
        sentence.append(next_word)
        # Corpus tokens keep their trailing punctuation, so this marks the
        # end of a sentence.
        if next_word.endswith(('.', '!', '?')):
            break
    return ' '.join(sentence)

# Take starting words from the user
starting_words = input("Enter the starting words (separate words with spaces): ").lower().split()

# Every two-word prefix the model has seen; a set makes the lookup a hash test.
known_prefixes = {(first_word, second_word) for first_word, second_word, _ in tri_grams}

# Only generate when the starting words form a known prefix. An if/else is
# used instead of exit(): calling exit() inside a notebook cell shuts down
# the kernel, which breaks "Run All" and discards every variable.
if tuple(starting_words) not in known_prefixes:
    print("Starting words not found in the corpus.")
else:
    # Test the model by generating a new sentence
    new_sentence = generate_sentence_tri_gram(tri_grams, starting_words)
    print("Generated Sentence:")
    print(new_sentence)


Enter the starting words (separate words with spaces): automotive industry
Generated Sentence:
automotive industry is undergoing rapid advancements in electric vehicles.


In [None]:
# Define the corpus of car-related sentences
car_sentences = [
    "The automotive industry is undergoing rapid advancements in electric vehicles.",
    "Many car manufacturers are investing heavily in autonomous driving technology.",
    "Fuel efficiency has become a key focus for consumers when purchasing new vehicles.",
    "Classic cars hold a special place in the hearts of many enthusiasts.",
    "Safety features such as automatic emergency braking and adaptive cruise control are becoming standard in modern cars.",
]

# Rebuild the token stream from the sentences defined in this cell. Without
# this, `corpus` would still hold whatever an earlier cell left in the kernel
# and the trigrams below would not reflect the sentences above.
corpus = " ".join(car_sentences).lower().split()

# Create trigrams and display them
tri_grams = list(trigrams(corpus))
tri_grams

[('the', 'automotive', 'industry'),
 ('automotive', 'industry', 'is'),
 ('industry', 'is', 'undergoing'),
 ('is', 'undergoing', 'rapid'),
 ('undergoing', 'rapid', 'advancements'),
 ('rapid', 'advancements', 'in'),
 ('advancements', 'in', 'electric'),
 ('in', 'electric', 'vehicles.'),
 ('electric', 'vehicles.', 'many'),
 ('vehicles.', 'many', 'car'),
 ('many', 'car', 'manufacturers'),
 ('car', 'manufacturers', 'are'),
 ('manufacturers', 'are', 'investing'),
 ('are', 'investing', 'heavily'),
 ('investing', 'heavily', 'in'),
 ('heavily', 'in', 'autonomous'),
 ('in', 'autonomous', 'driving'),
 ('autonomous', 'driving', 'technology.'),
 ('driving', 'technology.', 'fuel'),
 ('technology.', 'fuel', 'efficiency'),
 ('fuel', 'efficiency', 'has'),
 ('efficiency', 'has', 'become'),
 ('has', 'become', 'a'),
 ('become', 'a', 'key'),
 ('a', 'key', 'focus'),
 ('key', 'focus', 'for'),
 ('focus', 'for', 'consumers'),
 ('for', 'consumers', 'when'),
 ('consumers', 'when', 'purchasing'),
 ('when', 'purcha