# Task Management Project Analysis

This notebook contains the analysis of task management data from Jira, including data loading, exploration, cleaning, and machine learning tasks.

## Week 1: Data Loading and Exploration

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re

# Download required NLTK data.
# Recent NLTK releases (3.9+) make word_tokenize load the Punkt model from the
# 'punkt_tab' resource, while older releases read 'punkt'. Requesting both
# keeps the tokenizer usable on either version; on an older NLTK the unknown
# 'punkt_tab' id is only reported by the downloader and does not stop the cell.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

### 1. Loading and Initial Data Exploration

In [None]:
# Read the Jira export into a DataFrame
df = pd.read_csv('jira_dataset.csv')

# Each overview is shown under its own caption: a preview of the first rows,
# the dtype of every column, and pandas' describe() summary.
overview_steps = [
    ("First few rows of the dataset:", df.head),
    ("\nData types of columns:", lambda: df.dtypes),
    ("\nSummary statistics:", df.describe),
]
for caption, build_view in overview_steps:
    print(caption)
    display(build_view())

### 2. Exploratory Data Analysis

In [None]:
# Bar chart with one bar per value of the 'priority' column
plt.figure(figsize=(10, 6))
priority_ax = sns.countplot(data=df, x='priority')
priority_ax.set_title('Distribution of Task Priorities')
# Tilt the category labels so longer names do not overlap
plt.setp(priority_ax.get_xticklabels(), rotation=45)
plt.show()

# Count of null entries per column
missing_per_column = df.isnull().sum()
print("\nMissing values in each column:")
display(missing_per_column)

# Count of rows that exactly repeat an earlier row
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")

### 3. Data Cleaning

In [None]:
# Standardize column names: lower-case, with spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Handle missing values.
# The text-preprocessing step reads 'project_description', so that column has
# to be filled here too; filling only 'description' would leave NaN in the
# column that is actually cleaned. fillna skips dict keys that are not columns
# of df, so listing both spellings is safe whichever one the dataset uses.
df = df.fillna({
    'description': '',
    'project_description': '',
    'priority': 'Medium',
})

# Remove duplicates
df = df.drop_duplicates()

print("Dataset shape after cleaning:", df.shape)

### 4. Text Preprocessing

In [None]:
# Stop-word set and stemmer are created on first use and then shared by every
# call, instead of being rebuilt for each row the function is applied to.
_TEXT_TOOLS = {}


def preprocess_text(text):
    """Normalize a piece of free text into a space-joined string of stemmed tokens.

    Steps: lower-case, strip ASCII punctuation, tokenize with NLTK's
    ``word_tokenize``, drop English stop words, apply Porter stemming.

    Args:
        text: The text to clean. Missing values (``None``, NaN, ``pd.NA``)
            yield an empty string; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        str: The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # Missing cells carry no text; without this guard a NaN (a float) would
    # fail on .lower().
    if text is None:
        return ''
    if not isinstance(text, str):
        try:
            if pd.isna(text):
                return ''
        except (TypeError, ValueError):
            # pd.isna is ambiguous for non-scalar values; treat them as text below
            pass
        text = str(text)

    # Convert to lowercase and remove punctuation
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    if not text.strip():
        # Nothing to tokenize; same result the full pipeline would give
        return ''

    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    stop_words = _TEXT_TOOLS['stop_words']
    stemmer = _TEXT_TOOLS['stemmer']

    # Tokenize, remove stopwords, then stem what remains
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(stemmer.stem(word) for word in tokens)

# Clean every entry of 'project_description' and keep the result in a new column
df['clean_description'] = df['project_description'].apply(preprocess_text)

# Show the first rows side by side, raw text next to its cleaned form
print("Sample of original and cleaned descriptions:")
sample_comparison = df[['project_description', 'clean_description']].head().rename(
    columns={'project_description': 'Original', 'clean_description': 'Cleaned'}
)
display(sample_comparison)

## Week 2: Feature Engineering and Model Training

### 6. Text Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# TF-IDF: sparse bag-of-words weights, vocabulary capped at 1000 terms
# NOTE(review): the vectorizer is fitted on every row of df before any
# train/test split happens, so its statistics also reflect rows that later end
# up in the test set. Consider fitting on the training rows only.
tfidf = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf.fit_transform(df['clean_description'])

# Word2Vec: train embeddings on the whitespace-tokenized cleaned descriptions
sentences = list(map(str.split, df['clean_description']))
word2vec_settings = {'vector_size': 100, 'window': 5, 'min_count': 1, 'workers': 4}
word2vec_model = Word2Vec(sentences, **word2vec_settings)

# Function to get document vectors using Word2Vec
def get_document_vector(text, model=None):
    """Average the Word2Vec vectors of the whitespace-separated words in ``text``.

    Args:
        text: A string of tokens separated by whitespace.
        model: Object exposing trained vectors as ``model.wv`` (supporting
            ``in``, indexing by word, and ``vector_size``). Defaults to the
            notebook-level ``word2vec_model``.

    Returns:
        numpy.ndarray: The element-wise mean of the vectors of all words known
        to the model, or an all-zero vector of the model's dimensionality when
        none of the words is known (including empty ``text``).
    """
    if model is None:
        model = word2vec_model
    keyed_vectors = model.wv
    word_vectors = [keyed_vectors[word] for word in text.split() if word in keyed_vectors]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    # Size the fallback from the model so it cannot drift from the training
    # configuration (previously a hard-coded 100).
    return np.zeros(keyed_vectors.vector_size)

# One averaged embedding per cleaned description, stacked into a single array
word2vec_features = np.array(list(map(get_document_vector, df['clean_description'])))

# Report the dimensions of both feature representations
for shape_caption, feature_matrix in (
    ("TF-IDF features shape:", tfidf_features),
    ("Word2Vec features shape:", word2vec_features),
):
    print(shape_caption, feature_matrix.shape)

### 7. Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder

# Encode the 'priority' labels as integers for the classifiers
le = LabelEncoder()
y = le.fit_transform(df['priority'])

# Shared split settings: 20% of the rows held out, fixed seed
split_options = {'test_size': 0.2, 'random_state': 42}

# Train/test split of the TF-IDF features
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    tfidf_features, y, **split_options
)

# Train/test split of the Word2Vec features
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    word2vec_features, y, **split_options
)

# Both classifiers are fitted on the TF-IDF training split
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

### 8. Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

def evaluate_model(model, X_test, y_test, model_name):
    """Print accuracy, weighted precision and weighted recall, then plot the confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.
        model_name: Label used in the printed header and the plot title.

    Returns:
        None. Results are printed and the heatmap is shown via ``plt.show()``.
    """
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 gives the same score as the default for classes that are
    # never predicted, without emitting a warning for each of them.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

    # Metrics are computed outside the f-strings: reusing the same quote
    # character inside an f-string is a syntax error before Python 3.12, and
    # the header's line break must be the escape \n, not a literal newline.
    print(f'\n{model_name} Results:')
    print(f'Accuracy: {accuracy:.3f}')
    print(f'Precision: {precision:.3f}')
    print(f'Recall: {recall:.3f}')

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Score each fitted classifier on the TF-IDF test split
for fitted_model, display_name in ((nb_model, 'Naive Bayes'), (svm_model, 'SVM')):
    evaluate_model(fitted_model, X_test_tfidf, y_test, display_name)