In [36]:
import re

import pandas as pd
import speech_recognition as sr
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [37]:
# Load the 20 Newsgroups corpus. Called with no arguments, so scikit-learn's defaults apply:
# per its documentation that is the training subset with headers, footers and quotes kept
# (see the `subset` and `remove` parameters if a stricter evaluation is wanted).
newsgroups = fetch_20newsgroups()

# Raw documents and their integer class ids (indices into newsgroups.target_names)
texts, categories = newsgroups.data, newsgroups.target

In [39]:
# Pair every raw document with its label in a single frame
df_texts = pd.DataFrame(dict(text=texts, category=categories))

# Shared NLP resources used by preprocess_text: a lemmatizer and the English stop-word set
# (a set so that membership tests are cheap)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw document into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text, delete every character that is neither a
    word character nor whitespace, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, and lemmatize the rest with the module-level
    ``lemmatizer``.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty string
        when no token survives).
    """
    # Lowercase first, then strip punctuation; characters are deleted, not replaced by spaces
    normalized = re.sub(r'[^\w\s]', '', text.lower())

    kept = []
    for word in normalized.split():
        # Stop words are filtered on their surface form, before lemmatization
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

# Apply preprocessing
df_texts['cleaned_text'] = df_texts['text'].apply(preprocess_text)

# Split data into training and test sets (fixed random_state keeps the split reproducible)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_texts['cleaned_text'], df_texts['category'], test_size=0.2, random_state=42)

# Vectorize the text data: the vectorizer is fitted on the training split only and
# then reused, unchanged, for the test split
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_test_tfidf = vectorizer.transform(test_texts)

# Scale the data. StandardScaler lives in sklearn.preprocessing and has to be imported
# in the imports cell, otherwise this cell fails with NameError on a fresh kernel.
# with_mean=False because sparse matrices don't support mean centering.
# Any text classified later must pass through both `vectorizer` and `scaler`,
# since the model is trained on the scaled features.
scaler = StandardScaler(with_mean=False)
X_train_text = scaler.fit_transform(X_train_tfidf)
X_test_text = scaler.transform(X_test_tfidf)

# Cheap invariants: one feature row per label, identical feature space for both splits
assert X_train_text.shape[0] == len(train_labels)
assert X_test_text.shape[0] == len(test_labels)
assert X_train_text.shape[1] == X_test_text.shape[1]


In [40]:
# Fit a logistic-regression classifier on the training features
# (max_iter raised to 1000 to give the solver more iterations)
model = LogisticRegression(max_iter=1000).fit(X_train_text, train_labels)

# Predict labels for the held-out split
predictions = model.predict(X_test_text)

# Overall accuracy on the test split
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1, labelled with the newsgroup names
report = classification_report(test_labels, predictions, target_names=newsgroups.target_names)
print(report)


Model Accuracy: 90.46%
                          precision    recall  f1-score   support

             alt.atheism       0.87      0.93      0.90        97
           comp.graphics       0.77      0.80      0.78       104
 comp.os.ms-windows.misc       0.84      0.83      0.84       115
comp.sys.ibm.pc.hardware       0.75      0.80      0.77       123
   comp.sys.mac.hardware       0.90      0.90      0.90       126
          comp.windows.x       0.90      0.90      0.90       106
            misc.forsale       0.81      0.83      0.82       109
               rec.autos       0.95      0.89      0.92       139
         rec.motorcycles       0.97      0.96      0.96       122
      rec.sport.baseball       0.97      0.97      0.97       102
        rec.sport.hockey       0.97      0.96      0.97       108
               sci.crypt       1.00      0.94      0.97       125
         sci.electronics       0.87      0.85      0.86       114
                 sci.med       0.97      0.97      0

In [41]:
def recognize_speech():
    """Record one utterance from the microphone and transcribe it.

    Audio is captured with ``Recognizer.listen`` and sent to
    ``Recognizer.recognize_google`` for transcription. Progress and error
    messages are printed.

    Returns:
        The transcribed text, or ``None`` when the audio could not be
        understood (``sr.UnknownValueError``) or the request failed
        (``sr.RequestError``).
    """
    recognizer = sr.Recognizer()

    # Only the recording needs the microphone; leave the `with` block before the
    # recognition request so the device context is not held during the network call.
    with sr.Microphone() as source:
        print("Please say something...")
        audio = recognizer.listen(source)

    try:
        print("Recognizing...")
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Google Web Speech API could not understand the audio.")
        return None  # explicit, so both failure paths read the same
    except sr.RequestError as e:
        print(f"Could not request results from Google Web Speech API; {e}")
        return None

    print(f"You said: {text}")
    return text

In [42]:
def get_input_and_predict():
    """Ask for typed or spoken input and print its predicted newsgroup category.

    The user picks ``text`` or ``mic`` at a prompt. The resulting text goes
    through the same pipeline the model was trained on: ``preprocess_text``,
    then the fitted ``vectorizer``, then the fitted ``scaler``, and finally
    ``model.predict``.

    Returns:
        None. The predicted category, or a message explaining why no
        prediction was made, is printed.
    """
    input_method = input("Do you want to input text or use the microphone? (text/mic): ").strip().lower()

    if input_method == 'text':
        text = input("Please enter your text: ").strip()
    elif input_method == 'mic':
        text = recognize_speech()
        if text is None:
            print("Could not recognize speech. Please try again.")
            return
    else:
        print("Invalid input method. Please choose 'text' or 'mic'.")
        return

    # Preprocess the text
    processed_text = preprocess_text(text)
    if not processed_text:
        # Nothing survived preprocessing (empty input or stop words only); an all-zero
        # feature vector would produce a prediction unrelated to the input.
        print("No usable words in the input after preprocessing. Please try again.")
        return

    # Vectorize the text
    vectorized_text = vectorizer.transform([processed_text])

    # Apply the scaler fitted on the training features: the model was trained on
    # scaled TF-IDF values, so unscaled vectors would be on a different scale.
    scaled_text = scaler.transform(vectorized_text)

    # Predict the category
    prediction = model.predict(scaled_text)
    category = newsgroups.target_names[prediction[0]]
    print(f"Predicted Category: {category}")

In [46]:
# Interactive demo: asks for the input mode at the prompt, then prints the predicted category
get_input_and_predict()

Do you want to input text or use the microphone? (text/mic):  text
Please enter your text:  i have to go to school


Predicted Category: misc.forsale


In [47]:
# Run the interactive demo again (the input mode is chosen at the prompt)
get_input_and_predict()

Do you want to input text or use the microphone? (text/mic):  mic


Please say something...
Recognizing...
You said: I have meeting today
Predicted Category: misc.forsale
