# Script to normalize Description

# Install Packages

In [None]:
!pip install pandas scikit-learn nltk

# Load Packages

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## Download required NLTK models and corpora

In [None]:
import nltk

# 'punkt' serves older NLTK releases; newer releases make word_tokenize look
# up 'punkt_tab' instead and raise LookupError when it is missing, so both
# tokenizer resources are fetched alongside the stop-word and WordNet corpora.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Function to normalize and standardize text

In [3]:
# Filled on first use so the stop-word corpus is read and the lemmatizer is
# built once, not once per row when preprocess_text is applied to a column.
_PREPROCESS_CACHE = {}


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both objects before storing either, so a failure (e.g. a
        # missing NLTK corpus) never leaves a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize a free-text description into a space-separated string of lemmas.

    Steps, in order: lowercase the text, delete every character that is not
    a-z or whitespace, tokenize with NLTK's word_tokenize, drop English stop
    words, and lemmatize each remaining token with WordNetLemmatizer (default
    arguments).

    Args:
        text: The raw description. Any non-string value (None, NaN, numbers)
            is treated as missing.

    Returns:
        The normalized tokens joined by single spaces, or "" when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return ""
    # Characters are deleted, not replaced by a space, so "e-mail" -> "email".
    text = re.sub(r'[^a-z\s]', '', text.lower())
    stop_words, lemmatizer = _get_text_resources()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )

# Load data

In [None]:
# Path of the input CSV file.
filename = 'xxxxxxxx.csv'  # replace with your file
# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=filename)

# Normalize the task descriptions (set 'DescriptionColumnName' below to the column that holds them)

In [None]:
# Run every raw description through preprocess_text and store the result in a new column.
raw_descriptions = df['DescriptionColumnName']  # replace by column name to receive normalization
df['cleaned_description'] = raw_descriptions.apply(preprocess_text)

# Use TF-IDF for text vectorization

In [25]:
# Learn the TF-IDF vocabulary from the cleaned descriptions and vectorize them in one step.
cleaned_corpus = df['cleaned_description']
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_corpus)

# Use KMeans clustering to find common task descriptions

In [None]:
num_clusters = 50  # Change this to the number of desired distinct descriptions
# The estimator settings are collected first; random_state is fixed at 0.
kmeans_options = {'n_clusters': num_clusters, 'random_state': 0}
kmeans = KMeans(**kmeans_options)
kmeans.fit(tfidf_matrix)

# Assign cluster labels

In [27]:
# Store the label KMeans assigned to each fitted row next to its description.
cluster_labels = kmeans.labels_
df['cluster'] = cluster_labels

# Display most representative task description for each cluster

In [29]:
# Column holding the original (un-normalized) descriptions. It must be the
# same column that was passed to preprocess_text, otherwise the lookup below
# raises KeyError.
description_column = 'DescriptionColumnName'  # replace by your column name

cluster_representatives = {}

# Distance from every row to every cluster centre: shape (n_rows, num_clusters).
# Rows are in the same order as df because tfidf_matrix was built from df.
centroid_distances = kmeans.transform(tfidf_matrix)
row_clusters = df['cluster'].to_numpy()

for cluster in range(num_clusters):
    member_positions = (row_clusters == cluster).nonzero()[0]
    # A cluster can end up without members (e.g. fewer distinct rows than
    # clusters); skip it instead of failing on an empty selection.
    if member_positions.size == 0:
        continue
    # The representative is the member closest to its own cluster centre,
    # rather than whichever row happens to come first.
    member_distances = centroid_distances[member_positions, cluster]
    closest_position = member_positions[member_distances.argmin()]
    cluster_representatives[cluster] = df[description_column].iloc[closest_position]

# Save results

In [30]:
# Write the DataFrame to CSV, leaving out the index column.
output_filename = 'normalized_tasks.csv'
df.to_csv(path_or_buf=output_filename, index=False)

# Print results

In [None]:
# Emit both status lines with a single print call, one per line.
print(
    f"Cluster representatives: {cluster_representatives}",
    f"Normalized tasks are saved to {output_filename}",
    sep="\n",
)

# TEST AREA

In [None]:
# Scratch check: show summary statistics of the resulting DataFrame.
df.describe()

In [None]:
# Scratch check: print the DataFrame's structural summary.
df.info()