### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Removing Stopwords

**Steps**:
1. Dataset: Use a dataset of textual product descriptions.
2. Stopword Removal: Utilize an NLP library (e.g., NLTK) to remove stopwords from the
descriptions.
3. Assess Impact: Evaluate the effect of stopword removal by comparing word frequencies
before and after removal.

In [None]:
# write your code from here

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

import nltk

# Fetch the NLTK data used below: the stopword corpus for stopwords.words()
# and the Punkt tokenizer model for word_tokenize().
# Newer NLTK releases load Punkt from the 'punkt_tab' package rather than the
# pickled 'punkt' one, so both are requested to work across NLTK versions.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

def _drop_stopwords(text, stop_words):
    """Return *text* lower-cased, tokenized and re-joined without stopwords."""
    kept = [tok for tok in word_tokenize(text.lower()) if tok not in stop_words]
    return " ".join(kept)


def remove_stopwords_from_descriptions(df, text_column):
    """Add a ``cleaned_description`` column with English stopwords removed.

    Each value in ``df[text_column]`` is converted to a string, lower-cased,
    tokenized with ``word_tokenize`` and filtered against NLTK's English
    stopword list; the surviving tokens are joined with single spaces. Only
    stopwords are dropped -- every other token the tokenizer emits is kept.

    Missing values (``NaN``/``None``) produce an empty cleaned description
    instead of the literal text ``"nan"``/``"none"``.

    Args:
        df: DataFrame holding the text column. It is modified in place.
        text_column: Name of the column containing the descriptions.

    Returns:
        The same ``df`` object, now carrying a ``cleaned_description`` column.
    """
    stop_words = set(stopwords.words('english'))
    column = df[text_column]
    # astype(str) alone would turn missing entries into the strings
    # "nan"/"None", which would then survive as bogus tokens; blank them out.
    descriptions = column.astype(str).where(column.notna(), "")
    df['cleaned_description'] = [
        _drop_stopwords(desc, stop_words) for desc in descriptions
    ]
    return df

def analyze_word_frequency(texts):
    """Count how often each token occurs across all of *texts*.

    Every text is lower-cased and split with ``word_tokenize``; the tokens
    from all texts are tallied together.

    Args:
        texts: Iterable of strings to analyse.

    Returns:
        A ``collections.Counter`` mapping each token to its total count.
    """
    frequencies = Counter()
    for document in texts:
        # Counter.update adds one count per token, in order of appearance.
        frequencies.update(word_tokenize(document.lower()))
    return frequencies

if __name__ == '__main__':
    # Tiny in-memory sample used to demonstrate the effect of stopword removal.
    data = {
        'ProductID': [1, 2, 3],
        'Description': [
            "The best product in the market with great features.",
            "A high-quality item, very useful and effective.",
            "This is an amazing product; you should buy it now!",
        ],
    }
    product_df = pd.DataFrame(data)
    text_column = 'Description'

    # Baseline: token counts on the untouched descriptions.
    print("Original Word Frequencies:")
    raw_texts = product_df[text_column].astype(str).tolist()
    original_frequencies = analyze_word_frequency(raw_texts)
    print(original_frequencies.most_common(10))

    # Clean a copy so product_df keeps only its original columns.
    df_cleaned = remove_stopwords_from_descriptions(product_df.copy(), text_column)

    # Same count after stopword removal, for comparison with the baseline.
    print("\nCleaned Word Frequencies:")
    cleaned_texts = df_cleaned['cleaned_description'].tolist()
    cleaned_frequencies = analyze_word_frequency(cleaned_texts)
    print(cleaned_frequencies.most_common(10))

    print("\nDataFrame with Cleaned Descriptions:")
    print(df_cleaned)