In [7]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# Download necessary NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Danya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Danya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Danya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Danya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Define the path to the input and output files
input_file = "./dataset/g02-federalspending.txt"
output_file = "./datasetCSV/g02-federalspending.csv"

# Read the lines from the input file
with open(input_file, 'r') as file:
    lines = file.readlines()

# Strip leading/trailing whitespace and filter out empty lines
lines = [line.strip() for line in lines if line.strip()]

# Create a DataFrame with a single column 'UserStory'
df = pd.DataFrame(lines, columns=['UserStory'])

# Save the DataFrame to a CSV file
df.to_csv(output_file, index=False)

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define text processing functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def sentence_length(text):
    sentences = sent_tokenize(text)
    return np.mean([len(sent.split()) for sent in sentences])

def count_adjectives_adverbs(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    adjectives = sum(1 for word, pos in pos_tags if pos in ('JJ', 'JJR', 'JJS'))
    adverbs = sum(1 for word, pos in pos_tags if pos in ('RB', 'RBR', 'RBS'))
    return adjectives, adverbs

# Apply text processing
df['cleaned_user_story'] = df['UserStory'].apply(clean_text)
df['filtered_user_story'] = df['cleaned_user_story'].apply(remove_stopwords)
df['processed_user_story'] = df['filtered_user_story'].apply(lemmatize_text)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['processed_user_story'])

# Extract additional features
df['sentence_lengths'] = df['processed_user_story'].apply(sentence_length)
df[['adjectives', 'adverbs']] = df['processed_user_story'].apply(lambda x: pd.Series(count_adjectives_adverbs(x)))

# Combine TF-IDF and additional features
tfidf_matrix = tfidf_matrix.toarray()
additional_features = df[['sentence_lengths', 'adjectives', 'adverbs']].values
X = np.hstack([tfidf_matrix, additional_features])

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.1, min_samples=20)
df['dbscan_cluster'] = dbscan.fit_predict(X_scaled)

# Filter out noise points and check for enough non-noise points
non_noise_data = df[df['dbscan_cluster'] != -1]

if len(non_noise_data) > 1:
    # Evaluate clustering performance
    silhouette_avg_dbscan = silhouette_score(X_scaled[non_noise_data.index], non_noise_data['dbscan_cluster'])
    print(f"DBSCAN Silhouette Score: {silhouette_avg_dbscan}")
else:
    print("Not enough non-noise points for silhouette score calculation.")

# Perform PCA for visualization
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)
pca_df = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2'])

# Plot DBSCAN clustering results
pca_df['DBSCAN_Cluster'] = df['dbscan_cluster']
plt.figure(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='DBSCAN_Cluster', palette='viridis', s=100, alpha=0.7)
plt.title(f'DBSCAN Clustering (PCA-reduced)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='DBSCAN Cluster')
plt.show()

# Save DBSCAN results
final_dbscan_output_file = './datasetCSV/processed_data_with_dbscan_clusters.csv'
df.to_csv(final_dbscan_output_file, index=False)
print(f"DBSCAN clustering completed. Data with clusters saved to {final_dbscan_output_file}")


NameError: name 'pos_tag' is not defined

In [35]:
import os
import pandas as pd

def read_text_files(folder_path):
  """
  Generator to read text files from a folder, removing quotation marks from the start and end.
  """
  for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
      file_path = os.path.join(folder_path, filename)
      try:
        with open(file_path, 'r', encoding='utf-8') as file:
          content = file.read().strip().lstrip('"').rstrip('"')  # Remove leading and trailing quotes
      except UnicodeDecodeError:
        # Fallback to a different encoding if utf-8 fails
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
          content = file.read().strip().lstrip('"').rstrip('"')  # Remove leading and trailing quotes
      yield content

def combine_files_to_csv(folder_path, output_csv):
  """
  Combine text files from a folder into a single CSV file without a header.
  """
  data = []
  for file_content in read_text_files(folder_path):
    data.append(file_content)

  # Create a DataFrame from the combined data without a header
  df = pd.DataFrame(data)

  # Save DataFrame to CSV without index or header
  df.to_csv(output_csv, index=False, header=False)

  print(f'All text files have been combined into {output_csv}')

# Specify the folder containing text files and the output CSV path
input_folder = './dataset/'  # Replace with your folder path
output_csv = './labelledData/combined_data.txt'  # Replace with your desired CSV path

combine_files_to_csv(input_folder, output_csv)

All text files have been combined into ./labelledData/combined_data.txt


In [37]:
import csv

# Define input and output files
input_file = './labelledData/combined_data.txt'
output_file = './labelledData/output.csv'

# Open the input file and output file
with open(input_file, 'r') as txt_file, open(output_file, 'w', newline='') as csv_file:
    # Create a CSV writer object
    csv_writer = csv.writer(csv_file)
    
    # Iterate over each line in the text file
    for line in txt_file:
        # Split the line into fields based on a delimiter (e.g., tab or comma)
        fields = line.strip().split('\t')  # Change delimiter if needed
        # Write the fields to the CSV file
        csv_writer.writerow(fields)

print("Conversion complete.")

Conversion complete.
