In [2]:
pip install svector

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
pip install time

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement time (from versions: none)

[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for time


In [None]:
#PART 1

In [51]:
pip install csv

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement csv (from versions: none)

[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for csv


In [1]:
import pandas as pd
import time
from svector import svector 
import csv

def read_from(textfile):
    data = pd.read_csv(textfile)
    for i in range(len(data)):
        id, words, label = data.iloc[i]
        yield (1 if label=="+" else -1, words.split())

In [2]:
def make_vector(words):
    v = svector()
    for word in words:
        v[word] += 1
    return v

In [3]:
#adding bias in this section
def make_vector1(words):
    v = svector()
    for word in words:
        v[word] += 1
    # This line adds the bias term to every vector.
    v['<bias>'] = 1
    return v

In [4]:
def test(devfile, model):
    tot, err = 0, 0
    for i, (label, words) in enumerate(read_from(devfile), 1): # note 1...|D|
        err += label * (model.dot(make_vector(words))) <= 0
    return err/i 

In [5]:
#with make_vector bias added
def test1(devfile, model):
    tot, err = 0, 0
    for i, (label, words) in enumerate(read_from(devfile), 1): # note 1...|D|
        err += label * (model.dot(make_vector1(words))) <= 0
    return err/i 

In [6]:

def train(trainfile, devfile, epochs=5):
    t = time.time()
    best_err = 1.
    model = svector()
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
            sent = make_vector(words)
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
        dev_err = test(devfile, model)
        best_err = min(best_err, dev_err)
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))

In [7]:
#with make_vector bias added
def train1(trainfile, devfile, epochs=5):
    t = time.time()
    best_err = 1.
    model = svector()
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
            sent = make_vector1(words)
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
        dev_err = test1(devfile, model)
        if dev_err < best_err:
            best_err = dev_err
            best_model = model.copy()
        
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    
    return best_model 

In [8]:
perceptron_model1 = train('train.csv', 'dev.csv')

epoch 1, update 38.8%, dev 36.6%
epoch 2, update 25.1%, dev 34.6%
epoch 3, update 20.7%, dev 33.8%
epoch 4, update 16.7%, dev 31.7%
epoch 5, update 13.8%, dev 34.0%
best dev err 31.7%, |w|=16743, time: 2.2 secs


In [9]:
perceptron_model2 = train1('train.csv', 'dev.csv')

epoch 1, update 39.0%, dev 39.6%
epoch 2, update 25.5%, dev 34.1%
epoch 3, update 20.8%, dev 35.3%
epoch 4, update 17.2%, dev 35.5%
epoch 5, update 14.1%, dev 28.9%
best dev err 28.9%, |w|=16744, time: 2.2 secs


In [None]:
def generate_predictions(test_filepath, model, output_filepath):

    test_data = pd.read_csv(test_filepath)
    # Here it will generate predictions for test_filepath


    with open(output_filepath, "w") as f:
        f.write("id,sentence,target\n")
        for i in range(len(test_data)):
            id_val, sentence_val, _ = test_data.iloc[i]
            words = sentence_val.split()
            # Using the same make_vector function that was used during training
            vector = make_vector1(words)
            # Predict '+' if the score is > 0, otherwise '-'
            prediction = '+' if model.dot(vector) > 0 else '-'
            
            f.write(f'{id_val},"{sentence_val}",{prediction}\n')
            
    print(f"Submission file saved as '{output_filepath}'.")


In [11]:
generate_predictions('test.csv', perceptron_model2, 'submission_perceptron.csv')


Generating predictions for 'test.csv'...
Submission file saved as 'submission_perceptron.csv'.


In [None]:
#PART 2

In [None]:
#Q1
def train_average(trainfile, devfile, epochs=10):
    t = time.time()
    best_err = 1.
    model = svector()
    best_model = svector() 
  
    # `cached_model` will store the sum of all updates, scaled by when they happened.
    cached_model = svector()
    # `c` is counter for the total number of examples seen.
    c = 1.0 # Using a float to make division easier 

    print("\n Training Averaged Perceptron (with Bias)")
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): 
            sent = make_vector1(words) 
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                
                #SMART AVERAGING UPDATE 
                cached_model += c * label * sent
            c += 1

        current_averaged_model = model - (1/c) * cached_model
        dev_err = test1(devfile, current_averaged_model)
        
        if dev_err < best_err:
            best_err = dev_err
            # saving the averaged model, if in case this is the best error 
            best_model = current_averaged_model.copy()
        
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(best_model), time.time() - t))
    
    return best_model

In [13]:
perceptron_model3 = train_average('train.csv', 'dev.csv')


 Training Averaged Perceptron (with Bias)
epoch 1, update 39.0%, dev 31.4%
epoch 2, update 25.5%, dev 27.7%
epoch 3, update 20.8%, dev 27.2%
epoch 4, update 17.2%, dev 27.6%
epoch 5, update 14.1%, dev 27.2%
epoch 6, update 12.2%, dev 26.7%
epoch 7, update 10.5%, dev 26.3%
epoch 8, update 9.7%, dev 26.4%
epoch 9, update 7.8%, dev 26.3%
epoch 10, update 6.9%, dev 26.3%
best dev err 26.3%, |w|=16744, time: 4.6 secs


In [14]:
#PART 2 Question 3
#Prints the top 20 positive sen
def get_pos_features(model, dev_file):
    sentence_score = []
    dev_data = pd.read_csv(dev_file)

    for i in range(len(dev_data)):
        _id, sentence, _ = dev_data.iloc[i]
        words = sentence.split()
        vector = make_vector1(words) 
        score = model.dot(vector)
        sentence_score.append({'sentence': sentence, 'score': score})
    sorted_sentences = sorted(sentence_score, key=lambda item: item['score'], reverse=True)
    
    print("\nTop 20 Positively Scored Sentences in Dev Data based on my perceptron model: ")
    for item in sorted_sentences[:20]:
        print(f"Score: {item['score']:.4f} | Sentence: {item['sentence']}")
    

In [15]:
top_pos= get_pos_features(perceptron_model3,'dev.csv')


Top 20 Positively Scored Sentences in Dev Data based on my perceptron model: 
Score: 40.8533 | Sentence: witty dialog between realistic characters showing honest emotions it 's touching and tender and proves that even in sorrow you can find humor like blended shades of lipstick , these components combine into one terrific story with lots of laughs
Score: 39.8136 | Sentence: both a beautifully made nature film and a tribute to a woman whose passion for this region and its inhabitants still shines in her quiet blue eyes
Score: 38.3537 | Sentence: a delightfully unpredictable , hilarious comedy with wonderful performances that tug at your heart in ways that utterly transcend gender labels
Score: 37.5523 | Sentence: a journey spanning nearly three decades of bittersweet camaraderie and history , in which we feel that we truly know what makes holly and marina tick , and our hearts go out to them as both continue to negotiate their imperfect , love hate relationship
Score: 37.5249 | Sentenc

In [None]:
def save_pos_sen_to_csv(model, dev_file, output_filepath):
    sentence_score = []
    dev_data = pd.read_csv(dev_file)

    for i in range(len(dev_data)):
        _id, sentence, _ = dev_data.iloc[i]
        words = sentence.split()
        vector = make_vector1(words) 
        score = model.dot(vector)
        sentence_score.append({'sentence': sentence, 'score': score})
        
    sorted_sentences = sorted(sentence_score, key=lambda item: item['score'], reverse=True)

   # Here it will generate predictions for output_filepath

    with open(output_filepath, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['score', 'sentence']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for item in sorted_sentences[:60]:
            writer.writerow({'score': f"{item['score']:.4f}", 'sentence': item['sentence']})
    print(f"Submission file saved as '{output_filepath}'.")

In [17]:
save_pos_sen_to_csv(perceptron_model3,'dev.csv','most_pos_sentences.csv')


Saving all positively sorted sentences to 'most_pos_sentences.csv'...
Submission file saved as 'most_pos_sentences.csv'.


In [18]:
#Prints the top 20 negative sentences
def get_neg_features(model,dev_file): 

    sentence_score = []
    dev_data = pd.read_csv(dev_file)

    for i in range(len(dev_data)):
        _id, sentence, _ = dev_data.iloc[i]
        words = sentence.split()
        vector = make_vector1(words)
        score = model.dot(vector)
        sentence_score.append({'sentence': sentence, 'score': score})
    sorted_sentences = sorted(sentence_score, key=lambda item: item['score'])

    print("\nTop 20 Most Negatively Scored Sentences in Dev Data based on my perceptron model")
    for item in sorted_sentences[:20]:
        print(f"Score: {item['score']:.4f} | Sentence: {item['sentence']}")

In [19]:
top_neg= get_neg_features(perceptron_model3,'dev.csv')


Top 20 Most Negatively Scored Sentences in Dev Data based on my perceptron model
Score: -55.3830 | Sentence: it 's not too fast and not too slow it 's not too racy and it 's not too offensive it 's not too much of anything
Score: -50.1244 | Sentence: the script was reportedly rewritten a dozen times either 11 times too many or else too few
Score: -43.6594 | Sentence: hawke 's film , a boring , pretentious waste of nearly two hours , does n't tell you anything except that the chelsea hotel today is populated by whiny , pathetic , starving and untalented artistes
Score: -42.0801 | Sentence: the script feels as if it started to explore the obvious voyeuristic potential of ` hypertime ' but then backed off when the producers saw the grosses for spy kids
Score: -41.1777 | Sentence: the thing about guys like evans is this you 're never quite sure where self promotion ends and the truth begins but as you watch the movie , you 're too interested to care
Score: -38.5063 | Sentence: resurrectio

In [None]:
def save_neg_to_csv(model, dev_file, output_filepath):
    sentence_score = []
    dev_data = pd.read_csv(dev_file)

    for i in range(len(dev_data)):
        _id, sentence, _ = dev_data.iloc[i]
        words = sentence.split()
        vector = make_vector1(words) 
        score = model.dot(vector)
        sentence_score.append({'sentence': sentence, 'score': score})
        
    sorted_sentences = sorted(sentence_score, key=lambda item: item['score'])

    # Here it will generate predictions for output_filepath

    with open(output_filepath, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['score', 'sentence']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for item in sorted_sentences[:60]:
            writer.writerow({'score': f"{item['score']:.4f}", 'sentence': item['sentence']})
    print(f"Submission file saved as '{output_filepath}'.")

In [21]:
save_neg_to_csv(perceptron_model3,'dev.csv','most_neg_sentences.csv')


Saving all negitively sorted sentences to 'most_neg_sentences.csv'...
Submission file saved as 'most_neg_sentences.csv'.


In [None]:
#testing my perceptron_model3 on test dataset
def generate_prediction_avg(test_filepath, model, output_filepath):
    test_data = pd.read_csv(test_filepath)
   # Here it will generate predictions for test_filepath


    with open(output_filepath, "w", newline='', encoding='utf-8') as f:
        # Write the required CSV header
        f.write("id,sentence,target\n")
        
        for i in range(len(test_data)):
            id_val, sentence_val, _ = test_data.iloc[i]
            words = sentence_val.split()
            vector = make_vector1(words)
            prediction = '+' if model.dot(vector) > 0 else '-'
            f.write(f'{id_val},"{sentence_val}",{prediction}\n')
            
    print(f"Submission file saved as '{output_filepath}'.")

In [24]:
generate_prediction_avg('test.csv', perceptron_model3, 'submission_perceptron.csv')


Generating predictions for 'test.csv'...
Submission file saved as 'submission_perceptron.csv'.


In [33]:
#PART 3
# Creating vector-creation functin for training the pruned data 
def make_vectorp(words, vocab=None):
    v = svector()
    for word in words:
        if vocab is None or word in vocab:
            v[word] += 1
    v['<bias>'] = 1
    return v

In [34]:
# Creating testing function for pruned data 
def testp(devfile, model, vocab=None):
    err = 0
    total = 0
    for label, words in read_from(devfile):
        total += 1
        vector = make_vectorp(words, vocab)
        err += label * (model.dot(vector)) <= 0
    return err / total


In [35]:
def train_average_pruned(trainfile, devfile, epochs=10, prune_threshold=1):
    t_start = time.time()
    
   # Building vocabulary by pruning words with count <= 1 as I have set in prune_threshold
    vocab_counts = {}
    for _, words in read_from(trainfile):
        for word in words:
            vocab_counts[word] = vocab_counts.get(word, 0) + 1
    pruned_vocab = {word for word, count in vocab_counts.items() if count > prune_threshold}
    print(f"Original vocab size: {len(vocab_counts)}, Pruned vocab size: {len(pruned_vocab)}")
    
    # Train using existing functions, now passing pruned_vocab dictionary
    best_err = 1.
    model = svector()
    cached_model = svector()
    c = 1.0
    best_model = svector()

    # Training Averaged Perceptron using Pruned data
    for z in range(1, epochs + 1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1):
            sent = make_vectorp(words, pruned_vocab)
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                cached_model += c * label * sent
            c += 1

        current_averaged_model = model - (1/c) * cached_model

    #Calculating the dev error now
        dev_err = testp(devfile, current_averaged_model, pruned_vocab)
        if dev_err < best_err:
            best_err = dev_err
            best_model = current_averaged_model.copy()
        
        print("epoch %d, update %.1f%%, dev %.1f%%" % (z, updates / i * 100, dev_err * 100))
        
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(best_model), time.time() - t_start))
    
    return best_model, pruned_vocab

In [36]:
Prune_model , vocab_1 = train_average_pruned('train.csv', 'dev.csv')

Original vocab size: 15805, Pruned vocab size: 8424
epoch 1, update 39.0%, dev 31.6%
epoch 2, update 26.4%, dev 27.5%
epoch 3, update 22.8%, dev 26.8%
epoch 4, update 18.8%, dev 26.6%
epoch 5, update 17.2%, dev 25.9%
epoch 6, update 14.8%, dev 26.5%
epoch 7, update 13.2%, dev 27.0%
epoch 8, update 12.7%, dev 26.7%
epoch 9, update 11.4%, dev 26.6%
epoch 10, update 10.6%, dev 26.2%
best dev err 25.9%, |w|=8425, time: 4.9 secs
